/**
 * Copyright (c) 2024 Huawei Technologies Co., Ltd.
 * This file is a part of the CANN Open Software.
 * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
 * Please refer to the License for details. You may not use this file except in compliance with the License.
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 * See LICENSE in the root of the software repository for the full text of the License.
 */

/*!
 * \file matmul_impl.h
 * \brief
 */
#ifndef IMPL_MATMUL_MATMUL_IMPL_H
#define IMPL_MATMUL_MATMUL_IMPL_H
#include "../../impl/matmul/matmul_utils.h"
#include "../../impl/matmul/modules/matmul_policy.h"
#include "../../impl/matmul/modules/matmul_private_modules.h"
#include "../../impl/matmul/modules/matmul_module.h"
#include "../../impl/matmul/modules/matmul_param.h"
#include "../../impl/matmul/matmul_macro_def.h"
namespace AscendC {
namespace Impl {
constexpr int32_t DOUBLE_SIZE = 2;
}

/**
 * @brief Base implementation of the AscendC high-level Matmul API.
 *
 * Composes all matmul sub-modules (copy-in pipelines, L1/L0 buffers,
 * M/N/K loop controllers, schedulers, bias and quant processors, ...)
 * through the MATMUL_IMPORT_MODULE* mixin macros, and exposes the
 * user-facing interface: Init / SetTensorA / SetTensorB / SetBias /
 * Iterate / IterateAll / IterateBatch / GetTensorC / End.
 *
 * Template parameters:
 *   A_TYPE / B_TYPE / C_TYPE / BIAS_TYPE - tensor type descriptors for the
 *       left input, right input, output and bias (element type, format,
 *       transpose flag, ...).
 *   MM_CFG  - compile-time matmul configuration (scheduling mode, tiling
 *             options); defaults to CFG_NORM.
 *   MM_CB   - user callback bundle for data-copy hooks.
 *   MATMUL_POLICY - module-composition policy selecting concrete module
 *                   implementations.
 */
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG = CFG_NORM,
class MM_CB = MatmulCallBackFunc<nullptr, nullptr, nullptr>, MATMUL_POLICY_DEFAULT_OF(MatmulPolicy)>
class MatmulImplBase
: MATMUL_IMPORT_MODULE(Context)
, MATMUL_IMPORT_MODULE(CubeOutBuffer)
, MATMUL_IMPORT_MODULE(CopyCubeOut)
, MATMUL_IMPORT_MODULE(CopyCubeInA)
, MATMUL_IMPORT_MODULE(CopyCubeInB)
, MATMUL_IMPORT_MODULE(CubeInBufferA)
, MATMUL_IMPORT_MODULE(CubeInBufferB)
, MATMUL_IMPORT_MODULE(MLoop)
, MATMUL_IMPORT_MODULE(NLoop)
, MATMUL_IMPORT_MODULE(KLoop)
, MATMUL_IMPORT_MODULE(Scheduler)
, MATMUL_IMPORT_MODULE(BiasScheduler)
, MATMUL_IMPORT_MODULE(BatchScheduler)
, MATMUL_IMPORT_MODULE_PRIVATE(CopyCubeInParamsA)
, MATMUL_IMPORT_MODULE_PRIVATE(CopyCubeInParamsB)
, MATMUL_IMPORT_MODULE_PRIVATE(DataCopyUtilsA)
, MATMUL_IMPORT_MODULE_PRIVATE(DataCopyUtilsB)
, MATMUL_IMPORT_MODULE_PRIVATE(BatchDataCopyUtilsA)
, MATMUL_IMPORT_MODULE_PRIVATE(BatchDataCopyUtilsB)
, MATMUL_IMPORT_MODULE_PRIVATE(BatchCopyCubeInParamsA)
, MATMUL_IMPORT_MODULE_PRIVATE(BatchCopyCubeInParamsB)
, MATMUL_IMPORT_MODULE_PRIVATE(MatmulQuantProcessor)
, MATMUL_IMPORT_MODULE_PRIVATE(MatmulShapeInfo)
, MATMUL_IMPORT_MODULE_PRIVATE(MatmulTensorInfoA)
, MATMUL_IMPORT_MODULE_PRIVATE(MatmulTensorInfoB)
, MATMUL_IMPORT_MODULE_PRIVATE(MatmulSubBlockInfo)
, MATMUL_IMPORT_MODULE_PRIVATE(MatmulShapeTiling)
, MATMUL_IMPORT_MODULE_PRIVATE(BatchCopyCubeInA)
, MATMUL_IMPORT_MODULE_PRIVATE(BatchCopyCubeInB)
, MATMUL_IMPORT_MODULE_PRIVATE(IterateController)
, MATMUL_IMPORT_MODULE_PRIVATE(LocalWorkspace)
, MATMUL_IMPORT_MODULE_PRIVATE(MatmulUserDefineInfo)
, MATMUL_IMPORT_MODULE_PRIVATE(LoadToA2)
, MATMUL_IMPORT_MODULE_PRIVATE(LoadToB2)
, MATMUL_IMPORT_MODULE_PRIVATE(TBufPoolL0)
, MATMUL_IMPORT_MODULE_PRIVATE(MmadCompute)
, MATMUL_IMPORT_MODULE_PRIVATE(CopyBiasIn)
, MATMUL_IMPORT_MODULE_PRIVATE(LoadBias2C2)
, MATMUL_IMPORT_MODULE_PRIVATE(C1Buffer)
, MATMUL_IMPORT_MODULE_PRIVATE(C2Buffer)
, MATMUL_IMPORT_MODULE_PRIVATE(BatchLoop)
, MATMUL_IMPORT_MODULE_PRIVATE(MatmulUnitFlag)
#if __CCE_AICORE__ == 220 || __CCE_AICORE__ == 300 || __CCE_AICORE__ == 200
// On these cores the macro-instruction parameter block is also mixed in as a base.
, MatmulMacroImpl<MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY_TEMPLATE>,
 A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, GetMatmulVersion(MM_CFG)>::PARAMS
#endif
{
public:
    // Public aliases re-exporting the template arguments for downstream modules.
    using AType = A_TYPE;
    using BType = B_TYPE;
    using CType = C_TYPE;
    using BiasType = BIAS_TYPE;
private:
    // Element-type shortcuts: L0cT is the L0C accumulator type derived from A's
    // element type; SrcT/SrcAT/SrcBT/DstT/BiasT are the raw element types.
    using L0cT = typename GetDstType<typename A_TYPE::T>::Type;
    using SrcT = typename A_TYPE::T;
    using SrcAT = typename A_TYPE::T;
    using SrcBT = typename B_TYPE::T;
    using DstT = typename C_TYPE::T;
    using BiasT = typename BIAS_TYPE::T;

#if __CCE_AICORE__ == 220 || __CCE_AICORE__ == 300 || __CCE_AICORE__ == 200
    using MatmulInstr = typename MatmulMacroImpl<MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY_TEMPLATE>,
    A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, GetMatmulVersion(MM_CFG)>::PARAMS;
#endif

public:
    __aicore__ inline MatmulImplBase() {};
    // --- Setup: tiling, shapes and inputs -----------------------------------
    __aicore__ inline void Init(const TCubeTiling* __restrict cubeTiling, TPipe* tpipe = nullptr);
    __aicore__ inline void SetOrgShape(int orgM, int orgN, int orgK);
    __aicore__ inline void SetOrgShape(int orgM, int orgN, int orgKa, int orgKb, int orgKc = 0);
    __aicore__ inline void SetSingleShape(int singleM, int singleN, int singleK);
    __aicore__ inline void SetTail(int tailM = -1, int tailN = -1, int tailK = -1);
    __aicore__ inline void SetTensorA(const GlobalTensor<SrcAT>& gm, bool isTransposeA = false);
    __aicore__ inline void SetTensorB(const GlobalTensor<SrcBT>& gm, bool isTransposeB = false);
    __aicore__ inline void SetBias(const GlobalTensor<BiasT>& biasGlobal);
    __aicore__ inline void SetSelfDefineData(const uint64_t dataPtr);
    __aicore__ inline void SetSparseIndex(const GlobalTensor<uint8_t>& indexGlobal);
    __aicore__ inline void SetUserDefInfo(const uint64_t tilingPtr);
    // --- Quant / anti-quant parameters --------------------------------------
    __aicore__ inline void SetAntiQuantScalar(const SrcT offsetScalar, const SrcT scaleScalar);
    __aicore__ inline void SetAntiQuantVector(const LocalTensor<SrcT> &offsetTensor,
        const LocalTensor<SrcT> &scaleTensor);
    __aicore__ inline void SetQuantScalar(const uint64_t quantScalar);
    __aicore__ inline void SetQuantVector(const GlobalTensor<uint64_t>& quantTensor);
    // --- Local (UB) input variants and scalar inputs ------------------------
    __aicore__ inline void SetTensorA(const LocalTensor<SrcAT>& leftMatrix, bool isTransposeA = false);
    __aicore__ inline void SetTensorAWithCopy(const GlobalTensor<SrcAT>& gm, const LocalTensor<SrcAT>& leftMatrix,
        bool isTransposeA = false);
    __aicore__ inline void SetTensorB(const LocalTensor<SrcBT>& rightMatrix, bool isTransposeB = false);
    __aicore__ inline void SetTensorA(SrcAT aScalar);
    __aicore__ inline void SetTensorB(SrcBT bScalar);
    __aicore__ inline void SetTensorBWithCopy(const GlobalTensor<SrcBT>& gm, const LocalTensor<SrcBT>& rightMatrix,
        bool isTransposeB = false);
    __aicore__ inline void SetBias(const LocalTensor<BiasT>& inputBias);
    __aicore__ inline void SetBatchNum(int32_t batchA, int32_t batchB);
    __aicore__ inline void DisableBias();
    __aicore__ inline void ClearBias();
    // --- Iteration / result retrieval ----------------------------------------
    template <bool sync = true> __aicore__ inline bool Iterate(bool enPartialSum = false);
    template <bool sync = true>
    __aicore__ inline void IterateAll(const GlobalTensor<DstT>& gm, uint8_t enAtomic = 0,
        bool enSequentialWrite = false, bool waitIterateAll = false, bool fakeMsg = false);
    template <bool sync = true>
    __aicore__ inline void IterateAll(const LocalTensor<DstT>& ubCmatrix, uint8_t enAtomic = 0);

    __aicore__ inline void IterateBatch(const GlobalTensor<DstT>& gm,
        bool enPartialSum, uint8_t enAtomic, bool enSequentialWrite, const uint32_t matrixStrideA = 0,
        const uint32_t matrixStrideB = 0, const uint32_t matrixStrideC = 0);
    __aicore__ inline void IterateBatch(const LocalTensor<DstT>& ubCmatrix,
        bool enPartialSum, uint8_t enAtomic, bool enSequentialWrite, const uint32_t matrixStrideA = 0,
        const uint32_t matrixStrideB = 0, const uint32_t matrixStrideC = 0);

    template <bool sync = true>
    __aicore__ inline void GetTensorC(const LocalTensor<DstT>& co2Local, uint8_t enAtomic = 0,
        bool enSequentialWrite = false);
    template <bool sync = true>
    __aicore__ inline void GetTensorC(const GlobalTensor<DstT>& gm, uint8_t enAtomic = 0,
        bool enSequentialWrite = false);
    template <bool sync = true>
    __aicore__ inline void GetTensorC(const GlobalTensor<DstT> &gm, const LocalTensor<DstT> &co2Local,
        uint8_t enAtomic = 0, bool enSequentialWrite = false);
    template <bool isTurnOnDebug = true>
    __aicore__ inline MatrixOffset GetOffsetC();
    __aicore__ inline void End();
    __aicore__ inline void SetHF32(bool enableHF32 = false, int32_t transMode = 0);
    __aicore__ inline void SetSubBlockIdx(uint8_t subBlockIdx);
    __aicore__ inline uint8_t GetSubBlockIdx();
    // Record a GM cache workspace address; size is currently unused here.
    template <class T> __aicore__ inline void SetWorkspace(__gm__ const T* addr, int size)
    {
        ASCENDC_ASSERT((addr != nullptr),
            { KERNEL_LOG(KERNEL_ERROR, "addr can not be nullptr"); });
        var.cacheWorkspaceAddr = reinterpret_cast<GM_ADDR>(const_cast<__gm__ T*>(addr));
    }
    // Convenience overload taking a GlobalTensor; forwards its address and byte size.
    template <class T> __aicore__ inline void SetWorkspace(GlobalTensor<T>& addr)
    {
        ASSERT(addr.GetSize() > 0);
        SetWorkspace(addr.GetPhyAddr(), addr.GetSize() * sizeof(T));
    }

    // Bind a UB scratch buffer; only supported on pre-220 cores.
    __aicore__ inline void SetLocalWorkspace(const LocalTensor<uint8_t>& tmpBuffer)
    {
#if __CCE_AICORE__ < 220
        MATMUL_MODULE(LocalWorkspace)->Init(tmpBuffer);
#else
        // Fixed typo in the diagnostic ("vecrsion do not" -> "version does not").
        ASCENDC_ASSERT((false),
            { KERNEL_LOG(KERNEL_ERROR, "current version does not support SetLocalWorkspace interface!"); });
#endif
    }

#ifdef ASCENDC_CPU_DEBUG
public:
    // CPU-debug-only counters for L1 load/cache statistics.
    uint32_t a1BigPackageLoadCount_ = 0;
    uint32_t b1BigPackageLoadCount_ = 0;
    uint32_t a1LoadCacheCount_ = 0;
    uint32_t b1LoadCacheCount_ = 0;
#endif

public:
    // Re-export imported module types so policies/other modules can name them.
    MATMUL_ALLOW_USING(CubeOutBuffer);
    MATMUL_ALLOW_USING(CubeInBufferA);
    MATMUL_ALLOW_USING(CubeInBufferB);
    MATMUL_ALLOW_USING(CopyCubeInA);
    MATMUL_ALLOW_USING(CopyCubeInB);
    MATMUL_ALLOW_USING(CopyCubeOut);
    MATMUL_ALLOW_USING(Context);
    MATMUL_ALLOW_USING(MLoop);
    MATMUL_ALLOW_USING(NLoop);
    MATMUL_ALLOW_USING(KLoop);
    MATMUL_ALLOW_USING(Scheduler);
    MATMUL_ALLOW_USING(BatchScheduler);
    MATMUL_ALLOW_USING(BiasScheduler);

    MATMUL_ALLOW_USING_PRIVATE(LoadToA2);
    MATMUL_ALLOW_USING_PRIVATE(LoadToB2);
    MATMUL_ALLOW_USING_PRIVATE(TBufPoolL0);
    MATMUL_ALLOW_USING_PRIVATE(MmadCompute);
    MATMUL_ALLOW_USING_PRIVATE(IterateController);
    MATMUL_ALLOW_USING_PRIVATE(MatmulQuantProcessor);
    MATMUL_ALLOW_USING_PRIVATE(CopyCubeInParamsA);
    MATMUL_ALLOW_USING_PRIVATE(CopyCubeInParamsB);
    MATMUL_ALLOW_USING_PRIVATE(DataCopyUtilsA);
    MATMUL_ALLOW_USING_PRIVATE(DataCopyUtilsB);
    MATMUL_ALLOW_USING_PRIVATE(BatchDataCopyUtilsA);
    MATMUL_ALLOW_USING_PRIVATE(BatchDataCopyUtilsB);
    MATMUL_ALLOW_USING_PRIVATE(BatchCopyCubeInParamsA);
    MATMUL_ALLOW_USING_PRIVATE(BatchCopyCubeInParamsB);
    MATMUL_ALLOW_USING_PRIVATE(MatmulTensorInfoA);
    MATMUL_ALLOW_USING_PRIVATE(MatmulTensorInfoB);
    MATMUL_ALLOW_USING_PRIVATE(MatmulSubBlockInfo);
    MATMUL_ALLOW_USING_PRIVATE(MatmulShapeTiling);
    MATMUL_ALLOW_USING_PRIVATE(MatmulShapeInfo);
    MATMUL_ALLOW_USING_PRIVATE(BatchCopyCubeInA);
    MATMUL_ALLOW_USING_PRIVATE(BatchCopyCubeInB);
    MATMUL_ALLOW_USING_PRIVATE(LocalWorkspace);
    MATMUL_ALLOW_USING_PRIVATE(MatmulUserDefineInfo);
    MATMUL_ALLOW_USING_PRIVATE(CopyBiasIn);
    MATMUL_ALLOW_USING_PRIVATE(LoadBias2C2);
    MATMUL_ALLOW_USING_PRIVATE(C1Buffer);
    MATMUL_ALLOW_USING_PRIVATE(C2Buffer);
    MATMUL_ALLOW_USING_PRIVATE(MatmulUnitFlag);
    MATMUL_ALLOW_USING_PRIVATE(BatchLoop);
    // Tag-dispatched aliases: select the A-side or B-side module by input tag.
    template <InputTypeTag TAG>
    using CubeInBuffer = typename AscendC::Conditional<TAG == InputTypeTag::A, CubeInBufferA, CubeInBufferB>::type;

    template <InputTypeTag TAG>
    using BatchCopyCubeInParams = typename AscendC::Conditional<TAG == InputTypeTag::A, BatchCopyCubeInParamsA, BatchCopyCubeInParamsB>::type;

    template <InputTypeTag TAG>
    using CopyCubeInParams =
        typename AscendC::Conditional<TAG == InputTypeTag::A, CopyCubeInParamsA, CopyCubeInParamsB>::type;


    template <InputTypeTag TAG>
    using MatmulTensorInfo =
        typename AscendC::Conditional<TAG == InputTypeTag::A, MatmulTensorInfoA, MatmulTensorInfoB>::type;

    template <InputTypeTag TAG>
    using DataCopyUtils = typename AscendC::Conditional<TAG == InputTypeTag::A, DataCopyUtilsA, DataCopyUtilsB>::type;

    template <InputTypeTag TAG>
    using BatchDataCopyUtils =
        typename AscendC::Conditional<TAG == InputTypeTag::A, BatchDataCopyUtilsA, BatchDataCopyUtilsB>::type;

    using CallBack = MM_CB;

private:
    template<typename, typename> friend struct DfxProxy;
    using IMPL = MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>;
    MATMUL_USE_MODULE(CopyCubeInA);
    MATMUL_USE_MODULE(CopyCubeInB);
    MATMUL_USE_MODULE(BatchCopyCubeInA);
    MATMUL_USE_MODULE(BatchCopyCubeInB);
    MATMUL_USE_MODULE(LocalWorkspace);

    // Batch-matmul configs route copy-in through the batch module; all others
    // use the plain per-matrix copy-in module.
    using ChosenCopyCubeInA = typename AscendC::Conditional<Impl::Detail::GetCopyCubeInType<A_TYPE, MM_CFG>() != Impl::Detail::CopyCubeInType::BMM,
                                                            CopyCubeInA, BatchCopyCubeInA>::type;

    using ChosenCopyCubeInB = typename AscendC::Conditional<Impl::Detail::GetCopyCubeInType<B_TYPE, MM_CFG>() != Impl::Detail::CopyCubeInType::BMM,
                                                            CopyCubeInB, BatchCopyCubeInB>::type;
    MATMUL_USE_MODULE(ChosenCopyCubeInA);
    MATMUL_USE_MODULE(ChosenCopyCubeInB);
    MATMUL_USE_MODULE(CubeOutBuffer);
    MATMUL_USE_MODULE(CopyCubeOut);
    MATMUL_USE_MODULE(Scheduler);
    MATMUL_USE_MODULE(BatchScheduler);
    MATMUL_USE_MODULE(BiasScheduler);
    MATMUL_USE_MODULE(MLoop);
    MATMUL_USE_MODULE(NLoop);
    MATMUL_USE_MODULE(KLoop);
    MATMUL_USE_MODULE(BatchLoop);
    MATMUL_USE_MODULE(TBufPoolL0);
    MATMUL_USE_MODULE(LoadToA2);
    MATMUL_USE_MODULE(MatmulShapeTiling);
    MATMUL_USE_MODULE(MatmulShapeInfo);

private:
    // SetTPipe needs direct access to var.tpipe_.
    template <class A_TYPE_, class B_TYPE_, class C_TYPE_, class BIAS_TYPE_, const auto &MM_CFG_, class MM_CB_,
        MATMUL_POLICY_VARIADIC_TEMPLATE_OF(MATMUL_POLICY_)>
    friend __aicore__ inline void SetTPipe(
        MatmulImpl<A_TYPE_, B_TYPE_, C_TYPE_, BIAS_TYPE_, MM_CFG_, MM_CB_, MATMUL_POLICY_...> &mm, TPipe* tpipe);
    // Per-scheduling-mode init/iterate/end/compute internals.
    __aicore__ inline void InitNorm(const TCubeTiling* __restrict cubeTiling, TPipe* tpipe);
    __aicore__ inline void InitNormScheduler(const TCubeTiling* __restrict cubeTiling, TPipe* tpipe);
    __aicore__ inline void InitMDL(const TCubeTiling* __restrict cubeTiling, TPipe* tpipe);
    __aicore__ inline void InitBatch(const TCubeTiling* __restrict cubeTiling, TPipe* tpipe);
    __aicore__ inline void InitIBShareNorm(const TCubeTiling* __restrict cubeTiling, TPipe* tpipe);
    template <bool sync = true> __aicore__ inline bool IterateNorm(bool enPartialSum = false);
    template <bool sync = true> __aicore__ inline bool IterateNormScheduler(bool enPartialSum = false);
    template <bool sync = true> __aicore__ inline bool IterateIBShareNorm(bool enPartialSum = false);
    template <bool sync = true> __aicore__ inline bool IterateSpecialMDL(bool enPartialSum = false);
    template <bool sync = true> __aicore__ inline bool IterateNormL0DB(bool enPartialSum);
    __aicore__ inline void EndNorm();
    __aicore__ inline void EndMDL();
    __aicore__ inline void EndIBShareNorm();
    __aicore__ inline void InitStepMParams();
    __aicore__ inline void InitStepNParams();
    __aicore__ inline void InitStepKParams();
    __aicore__ inline void LoadC(bool enPartialSum = false);
    __aicore__ inline void LoadBias(const LocalTensor<L0cT>& cMatrix, int col);
    __aicore__ inline void LoadBias(GlobalTensor<BiasT>& biasGlobal, const LocalTensor<L0cT>& cMatrix, int col);
    __aicore__ inline void Compute(bool enPartialSum = false);
    __aicore__ inline void ComputeNorm(bool enPartialSum = false);
    __aicore__ inline void ComputeIBShareNorm(bool enPartialSum = false);
    __aicore__ inline void ComputeSpecialMDL(bool enPartialSum = false);
    __aicore__ inline void ComputeBatch(const GlobalTensor<DstT>& gm,
        bool enPartialSum, uint8_t enAtomic, bool enSequentialWrite, const uint32_t matrixStrideA = 0,
        const uint32_t matrixStrideB = 0, const int32_t batchOuterIdx = 0);
    __aicore__ inline void ComputeBatch(const LocalTensor<DstT>& dst,
        bool enPartialSum, uint8_t enAtomic, bool enSequentialWrite, const uint32_t matrixStrideA = 0,
        const uint32_t matrixStrideB = 0, const int32_t batchOuterIdx = 0);
    __aicore__ inline void ComputeNormWithMdb(int kInner);
    __aicore__ inline void ComputeNormWithNdb(int kInner);
    __aicore__ inline void ComputeBatchNormL0DB(int kInner);
    __aicore__ inline void ComputeNormL0DB(bool enPartialSum);
    template <bool sync = true>
    __aicore__ inline void GetTensorCImpl(const LocalTensor<DstT>& co2Local, uint8_t enAtomic = 0,
        bool enSequentialWrite = false);
    template <bool sync = true>
    __aicore__ inline void GetTensorCImpl(const GlobalTensor<DstT>& gm, uint8_t enAtomic = 0,
        bool enSequentialWrite = false);
    template <bool sync = true>
    __aicore__ inline void GetTensorCImpl(const GlobalTensor<DstT> &gm, const LocalTensor<DstT> &co2Local,
        uint8_t enAtomic = 0, bool enSequentialWrite = false);
    __aicore__ inline void OnLoadInA2(const LocalTensor<SrcT>& dst, const LocalTensor<SrcT>& aMatrix);
    __aicore__ inline void OnLoadInB2(const LocalTensor<SrcT>& dst, const LocalTensor<SrcT>& bMatrix);
    __aicore__ inline void FixpipeL0CToGm(const GlobalTensor<DstT> &gm, const LocalTensor<L0cT> &co1Local,
        int curM, int curN, uint8_t enAtomic, bool enSequentialWrite);
    __aicore__ inline void FixpipeOutToGm(const GlobalTensor<DstT> &gm, const LocalTensor<L0cT> &co1Local,
        int curM, int curN, uint8_t enAtomic, bool enSequentialWrite);

    __aicore__ inline void CheckIterSize();
    __aicore__ inline void CheckTiling();
    // Batch-iteration offset/info helpers.
    __aicore__ inline void UpdateBatchIterateInfo(const int32_t batchNum, const int32_t batchIdx,
        const int32_t splitOuterIdx, const int32_t splitSize);
    __aicore__ inline int32_t GetBatchIterateBiasOffset(const int32_t batchNum, const int32_t batchIdx,
        bool& enableBiase, const int32_t splitOuterIdx, const int32_t splitSize);
    __aicore__ inline int32_t GetBatchIterateBOffset(const int32_t batchNum, const int32_t batchIdx,
        const int32_t splitOuterIdx, const int32_t splitSize);
    __aicore__ inline int32_t GetBatchIterateAOffset(const int32_t batchNum, const int32_t batchIdx,
        const int32_t splitOuterIdx, const int32_t splitSize);
    __aicore__ inline void LoadBatchBiasToL1(const int32_t batchOuterIdx = 0);
    __aicore__ inline void UpdateBatchIterateInfoConstant(const int32_t batchNum, const int32_t batchIdx,
        const int32_t splitOuterIdx, const int32_t splitSize);
    __aicore__ inline int32_t GetBatchIterateBOffsetConstant(const int32_t batchNum, const int32_t batchIdx,
        const int32_t splitOuterIdx, const int32_t splitSize);
    __aicore__ inline int32_t GetBatchIterateAOffsetConstant(const int32_t batchNum, const int32_t batchIdx,
        const int32_t splitOuterIdx, const int32_t splitSize);
    __aicore__ inline void GetTensorCForBatch(
        const GlobalTensor<DstT> &cGlobal, const int32_t iBatchIn, uint8_t enAtomic, bool enSequentialWriteIn);
    __aicore__ inline void GetTensorCForBatch(
        const LocalTensor<DstT> &dst, const int32_t iBatchIn, uint8_t enAtomic, bool enSequentialWriteIn);
    __aicore__ inline void GetTensorCByLayout(const GlobalTensor<DstT> &cGlobal, uint8_t enAtomic,
        bool enSequentialWrite, const uint32_t nGapOffset, const uint32_t mGapOffset);
    __aicore__ inline int GetND2NZOffsetB();

private:
    // Runtime parameter block (tiling, tensor addresses, flags) selected by
    // config version.
    typename Impl::Detail::MatmulParams<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, GetMatmulVersion(MM_CFG)>::PARAMS var;

    // Per-core-generation on-chip buffer capacities in bytes.
#if __CCE_AICORE__ < 220
    constexpr static int L1Size_ = 1024 * 1024;
    constexpr static int L0CSize_ = 256 * 1024;
#elif __CCE_AICORE__ == 300
    constexpr static int L1Size_ = 1024 * 1024;
    constexpr static int L0CSize_ = 128 * 1024;
#else
    constexpr static int L1Size_ = 512 * 1024;
    constexpr static int L0CSize_ = 128 * 1024;
#endif
    constexpr static int L0ASize_ = 64 * 1024;
    constexpr static int L0BSize_ = 64 * 1024;

    // Element-type-dependent packing factor and C0 block size.
    constexpr static int32_t factor_ = AuxGetFactor<SrcT>();
    constexpr static int32_t c0Size_ = AuxGetC0Size<SrcT>();

    // Original (full) matrix dimensions.
    int M_;
    int N_;
    int Ka_;
    int Kb_;
    int Kc_;
    int32_t batchA_ = 1, batchB_ = 1;
    int32_t batchOuter_ = 1;

    // Empty placeholder used when intraBlockPartSum is disabled.
    struct IntraBlockBase {
        __aicore__ inline IntraBlockBase() {};
    };

    // State for intra-block partial-sum mode: second set of input addresses
    // and shapes handled within the same block.
    struct IntraBlock {
        __aicore__ inline IntraBlock(){};
        __gm__ SrcT* aGlobal;
        __gm__ SrcT* bGlobal;
        __gm__ BiasT* biasGlobal;
        int M;
        int N;
        int Ka;
        int Kb;
        int Kc;
        int singleCoreM;
        int singleCoreN;
        int singleCoreK;
        bool enableBias = false;
        bool isTransposeA = false;
        bool isTransposeB = false;
        bool fakeMsg = false;
    };

    // Zero-size base when the feature is off, real state when it is on.
    using INTRABLOCK = typename Conditional<ToMatmulConfig(MM_CFG).intraBlockPartSum, IntraBlock, IntraBlockBase>::type;
    INTRABLOCK intraBlockMatmul;
};

// Match CallBack with no policy parameter.
// Partial specialization chosen when the caller supplies a callback bundle but
// no explicit policy; forwards to MatmulImplBase with the default policy.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB>
class MatmulImpl<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB>
: public MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB> {
public:
    __aicore__ inline MatmulImpl() {}
};

// Match Policy with CallBack parameter.
// Partial specialization chosen when both a callback bundle and an explicit
// composition policy are supplied; forwards both to MatmulImplBase.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
class MatmulImpl<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>
: public MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY> {
public:
    __aicore__ inline MatmulImpl() {}
};

// Friend helper that attaches an externally owned TPipe to a MatmulImpl
// instance by writing directly into its private parameter block.
// Declared as a friend of MatmulImplBase, which grants access to mm.var.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto &MM_CFG, class MM_CB,
    MATMUL_POLICY_VARIADIC_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void SetTPipe(MatmulImpl<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY...> &mm,
    TPipe* tpipe)
{
    mm.var.tpipe_ = tpipe;
}

// Returns the extra L1 byte offset reserved for the B matrix when it must be
// converted from ND to NZ layout on the fly.
// For int8 x int8, non-transposed, ND-format B the offset is one full B tile
// (base or multi-step, depending on the scheduling mode), doubled; otherwise
// a tail-alignment pad is reserved when the relevant single-core dimension is
// not a multiple of c0Size_ (32 presumably is the block size in bytes — TODO
// confirm against ND2NZ copy-in implementation).
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline int MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::GetND2NZOffsetB()
{
    int bTmp = 0;
    // Every term of this condition is a compile-time constant, so evaluate it
    // with if constexpr (consistent with the nested branches) instead of
    // emitting a runtime check; the dead branch is then discarded entirely.
    if constexpr (IsSameType<typename A_TYPE::T, int8_t>::value && IsSameType<typename B_TYPE::T, int8_t>::value &&
        !B_TYPE::isTrans && B_TYPE::format == CubeFormat::ND) {
        if constexpr (DoMatmulNorm(MM_CFG) || DoMatmulBasicBlock(MM_CFG)) {
            bTmp = var.tiling_.GetBaseK() * var.tiling_.GetBaseN();
        } else if constexpr (DoMatmulMDL(MM_CFG) || DoMatmulSpecialMDL(MM_CFG)) {
            bTmp = var.tiling_.GetBaseK() *  var.tiling_.GetStepKa() * var.tiling_.GetBaseN() * var.tiling_.GetStepN();
        }
        bTmp += bTmp; // double the tile reservation
    } else {
        if (!var.isTransposeB_ && (var.tiling_.GetSingleCoreN() % c0Size_ != 0)) {
            bTmp = var.tiling_.GetBaseK() * 32;
        } else if (var.isTransposeB_ && (var.tiling_.GetSingleCoreK() % c0Size_ != 0)) {
            bTmp = var.tiling_.GetBaseN() * 32;
        }
    }
    return bTmp;
}

// Stores scalar anti-quantization parameters (offset and scale) applied when
// dequantizing int8 B data for a half-precision matmul.
// Only supported on __CCE_AICORE__ == 200 with A = half and B = int8; all
// other combinations trigger a kernel assertion.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::SetAntiQuantScalar(
    const SrcT offsetScalar, const SrcT scaleScalar)
{
#if __CCE_AICORE__ == 200
    if constexpr (IsSameType<typename A_TYPE::T, half>::value && IsSameType<typename B_TYPE::T, int8_t>::value) {
        var.antiQuantOffsetScalar_ = offsetScalar;
        var.antiQuantScaleScalar_ = scaleScalar;
    } else {
        ASCENDC_ASSERT((false),
            { KERNEL_LOG(KERNEL_ERROR, "A type should be half and B type should be int8"); });
    }
#else
    ASCENDC_ASSERT((false),
        { KERNEL_LOG(KERNEL_ERROR, "Do not support set anti-quant param."); });
#endif
}

// Stores per-channel anti-quantization parameters (offset and scale tensors)
// applied when dequantizing int8 B data for a half-precision matmul.
// Same platform/type restrictions as SetAntiQuantScalar: only
// __CCE_AICORE__ == 200 with A = half and B = int8.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto &MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::SetAntiQuantVector(
    const LocalTensor<SrcT> &offsetTensor, const LocalTensor<SrcT> &scaleTensor)
{
#if __CCE_AICORE__ == 200
    if constexpr (IsSameType<typename A_TYPE::T, half>::value && IsSameType<typename B_TYPE::T, int8_t>::value) {
        var.antiQuantOffsetTensor_ = offsetTensor;
        var.antiQuantScaleTensor_ = scaleTensor;
    } else {
        ASCENDC_ASSERT((false),
            { KERNEL_LOG(KERNEL_ERROR, "A type should be half and B type should be int8"); });
    }
#else
    ASCENDC_ASSERT((false),
        { KERNEL_LOG(KERNEL_ERROR, "Do not support set anti-quant param."); });
#endif
}

// Records a user-supplied data pointer for the self-defined data-copy
// callbacks. Silently a no-op on cores other than __CCE_AICORE__ == 220.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::SetSelfDefineData(
    const uint64_t dataPtr)
{
#if __CCE_AICORE__ == 220
    var.dataPtr_ = dataPtr;
#endif
}

// Forwards the sparse-index tensor to the B-side copy-in module.
// Effective only on __CCE_AICORE__ == 220 with an MDL config whose B type
// carries a sparse index; a no-op everywhere else.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::SetSparseIndex(const GlobalTensor<uint8_t>& indexGlobal)
{
#if __CCE_AICORE__ == 220
    if constexpr (DoMatmulMDL(MM_CFG) && HasSparseIndex<B_TYPE>()) {
        MATMUL_MODULE(CopyCubeInB)->SetSparseIndex(indexGlobal);
    }
#endif
}

// Records a user-supplied tiling pointer for user-defined callbacks.
// Silently a no-op on cores other than __CCE_AICORE__ == 220.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::SetUserDefInfo(
    const uint64_t tilingPtr)
{
#if __CCE_AICORE__ == 220
    var.tilingPtr_ = tilingPtr;
#endif
}

// Forwards a packed scalar quantization parameter to the quant-processor
// module (inherited base), which applies it during fixpipe output.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::SetQuantScalar(
    const uint64_t quantScalar)
{
    MatmulQuantProcessor::SetQuantScalar(quantScalar);
}

// Forwards a per-channel quantization parameter tensor (in GM) to the
// quant-processor module (inherited base).
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::SetQuantVector(
    const GlobalTensor<uint64_t>& quantTensor)
{
    MatmulQuantProcessor::SetQuantVector(quantTensor);
}

// Debug-time validation of the iteration counts derived from the tiling:
// all three loop counts must be positive, and for (special) MDL scheduling
// a K loop longer than the step size forces the corresponding M/N step to 1.
// All checks are ASCENDC_ASSERTs, i.e. active only in checked builds.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::CheckIterSize()
{
    ASCENDC_ASSERT((var.nIter_ > 0),
                   { KERNEL_LOG(KERNEL_ERROR, "var.nIter_ is %d , which should be larger than 0", var.nIter_); });
    ASCENDC_ASSERT((var.mIter_ > 0),
                   { KERNEL_LOG(KERNEL_ERROR, "var.mIter_ is %d , which should be larger than 0", var.mIter_); });
    ASCENDC_ASSERT((var.kIter_ > 0),
                   { KERNEL_LOG(KERNEL_ERROR, "var.kIter_ is %d , which should be larger than 0", var.kIter_); });
    if constexpr (DoMatmulMDL(MM_CFG)) {
        // When K cannot be covered by one Ka step, full-load of A rows is
        // impossible, so stepM must be 1 (and symmetrically for B / stepN).
        if (var.kIter_ > var.tiling_.GetStepKa()) {
            ASCENDC_ASSERT((var.tiling_.GetStepM() == 1),
                           { KERNEL_LOG(KERNEL_ERROR, "stepM is %d which can only be 1", var.tiling_.GetStepM()); });
        }
        if (var.kIter_ > var.tiling_.GetStepKb()) {
            ASCENDC_ASSERT((var.tiling_.GetStepN() == 1),
                           { KERNEL_LOG(KERNEL_ERROR, "stepN is %d which can only be 1", var.tiling_.GetStepN()); });
        }
    }
    if constexpr (DoMatmulSpecialMDL(MM_CFG)) {
        if (var.kIter_ > var.tiling_.GetStepKa()) {
            ASCENDC_ASSERT((var.tiling_.GetStepM() == 1),
                           { KERNEL_LOG(KERNEL_ERROR, "stepM is %d which can only be 1", var.tiling_.GetStepM()); });
        }
    }
}
// Returns the sub-block index previously stored via SetSubBlockIdx.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto &MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline uint8_t MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::GetSubBlockIdx()
{
    return var.subBlockIdx_;
}

template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::CheckTiling()
{
#ifdef ASCENDC_CPU_DEBUG
    ASCENDC_ASSERT((M_ > 0), { KERNEL_LOG(KERNEL_ERROR, "M_ is %d , which should be larger than 0", M_); });
    ASCENDC_ASSERT((N_ > 0), { KERNEL_LOG(KERNEL_ERROR, "N_ is %d , which should be larger than 0", N_); });
    ASCENDC_ASSERT((Ka_ > 0), { KERNEL_LOG(KERNEL_ERROR, "Ka_ is %d , which should be larger than 0", Ka_); });
    ASCENDC_ASSERT((Kb_ > 0), { KERNEL_LOG(KERNEL_ERROR, "Kb_ is %d , which should be larger than 0", Kb_); });
    ASCENDC_ASSERT((var.singleCoreM_ > 0), {
        KERNEL_LOG(KERNEL_ERROR, "var.singleCoreM_ is %d , which should be larger than 0", var.singleCoreM_);
    });
    ASCENDC_ASSERT((var.singleCoreN_ > 0), {
        KERNEL_LOG(KERNEL_ERROR, "var.singleCoreN_ is %d , which should be larger than 0", var.singleCoreN_);
    });
    ASCENDC_ASSERT((var.singleCoreK_ > 0), {
        KERNEL_LOG(KERNEL_ERROR, "var.singleCoreK_ is %d , which should be larger than 0", var.singleCoreK_);
    });
    ASCENDC_ASSERT((var.tiling_.GetBaseM() > 0), {
        KERNEL_LOG(KERNEL_ERROR, "var.tiling_.GetBaseM() is %d , which should be larger than 0", var.tiling_.GetBaseM());
    });
    ASCENDC_ASSERT((var.tiling_.GetBaseN() > 0), {
        KERNEL_LOG(KERNEL_ERROR, "var.tiling_.GetBaseN() is %d , which should be larger than 0", var.tiling_.GetBaseN());
    });
    ASCENDC_ASSERT((var.tiling_.GetBaseK() > 0), {
        KERNEL_LOG(KERNEL_ERROR, "var.tiling_.GetBaseK() is %d , which should be larger than 0", var.tiling_.GetBaseK());
    });
    ASCENDC_ASSERT((var.tiling_.GetDepthA1() > 0), {
        KERNEL_LOG(KERNEL_ERROR, "var.tiling_.GetDepthA1() is %d , which should be larger than 0", var.tiling_.GetDepthA1());
    });
    ASCENDC_ASSERT((var.tiling_.GetDepthB1() > 0), {
        KERNEL_LOG(KERNEL_ERROR, "var.tiling_.GetDepthB1() is %d , which should be larger than 0", var.tiling_.GetDepthB1());
    });
    ASCENDC_ASSERT((var.tiling_.GetStepM() > 0), {
        KERNEL_LOG(KERNEL_ERROR, "var.tiling_.GetStepM() is %d , which should be larger than 0", var.tiling_.GetStepM());
    });
    ASCENDC_ASSERT((var.tiling_.GetStepN() > 0), {
        KERNEL_LOG(KERNEL_ERROR, "var.tiling_.GetStepN() is %d , which should be larger than 0", var.tiling_.GetStepN());
    });
    ASCENDC_ASSERT((var.tiling_.IsBias() >= 0), {
        KERNEL_LOG(KERNEL_ERROR, "var.tiling_.IsBias() is %d , which should be not less than 0", var.tiling_.IsBias());
    });

#if __CCE_AICORE__ < 220
    ASCENDC_ASSERT((var.tiling_.GetTransLength() > 0), {
        KERNEL_LOG(KERNEL_ERROR, "var.tiling_.GetTransLength() is %d , which should be larger than 0",
            var.tiling_.GetTransLength());
    });
    if constexpr (!ToMatmulConfig(MM_CFG).enableUBReuse) {
        ASCENDC_ASSERT(var.tiling_.GetTransLength() * 4 <= 256 * 1024, { KERNEL_LOG(KERNEL_ERROR,
            "When enableUBReuse is false, var.tiling_.GetTransLength() * 4 should be less than UB size");});
    }
#endif
    ASCENDC_ASSERT((var.tiling_.GetIterateOrder() >= 0), {
        KERNEL_LOG(KERNEL_ERROR, "var.tiling_.GetIterateOrder() is %d , which should be not less than 0",
            var.tiling_.GetIterateOrder());
    });
    ASCENDC_ASSERT((var.tiling_.GetShareMode() >= 0), {
        KERNEL_LOG(KERNEL_ERROR, "var.tiling_.GetShareMode() is %d , which should be not less than 0",
            var.tiling_.GetShareMode());
    });
    ASCENDC_ASSERT((var.tiling_.GetShareL1Size() >= 0), {
        KERNEL_LOG(KERNEL_ERROR, "var.tiling_.GetShareL1Size() is %d , which should be not less than 0",
            var.tiling_.GetShareL1Size());
    });
    ASCENDC_ASSERT((var.tiling_.GetShareL0CSize() >= 0), {
        KERNEL_LOG(KERNEL_ERROR, "var.tiling_.GetShareL0CSize() is %d , which should be not less than 0",
            var.tiling_.GetShareL0CSize());
    });
    ASCENDC_ASSERT((var.tiling_.GetShareUbSize() >= 0), {
        KERNEL_LOG(KERNEL_ERROR, "var.tiling_.GetShareUbSize() is %d , which should be not less than 0",
            var.tiling_.GetShareUbSize());
    });

    ASCENDC_ASSERT((var.tiling_.GetBaseM() * var.tiling_.GetBaseK() * sizeof(SrcT) <= L0ASize_), {
        KERNEL_LOG(KERNEL_ERROR, "baseM * baseK is %d , which should be not larger than L0ASize_ %d",
            var.tiling_.GetBaseM() * var.tiling_.GetBaseK() * sizeof(SrcT), L0ASize_);
    });
    ASCENDC_ASSERT((var.tiling_.GetBaseN() * var.tiling_.GetBaseK() * sizeof(SrcT) <= L0BSize_), {
        KERNEL_LOG(KERNEL_ERROR, "baseN * baseK is %d , which should be not larger than L0BSize_ %d",
            var.tiling_.GetBaseN() * var.tiling_.GetBaseK() * sizeof(SrcT), L0BSize_);
    });
    ASCENDC_ASSERT((var.tiling_.GetBaseM() * var.tiling_.GetBaseN() * sizeof(L0cT) <= L0CSize_), {
        KERNEL_LOG(KERNEL_ERROR, "baseM * baseN is %d , which should be not larger than L0CSize_ %d",
            var.tiling_.GetBaseM() * var.tiling_.GetBaseN() * sizeof(L0cT), L0CSize_);
    });
#if __CCE_AICORE__ == 220
    if constexpr ((DoMatmulNorm(MM_CFG) || DoMatmulMDL(MM_CFG)) && ToMatmulConfig(MM_CFG).isA2B2Shared) {
        ASCENDC_ASSERT((var.tiling_.GetBaseM() * var.tiling_.GetBaseK() * sizeof(SrcT) <= L0ASize_ / Impl::DB_FACTOR), {
            KERNEL_LOG(KERNEL_ERROR, "baseM * baseK is %d , which should be not larger than A2 Size / 2 when isA2B2Shared is enable %d",
                var.tiling_.GetBaseM() * var.tiling_.GetBaseK() * sizeof(SrcT), L0ASize_ / Impl::DB_FACTOR);
        });
        ASCENDC_ASSERT((var.tiling_.GetBaseN() * var.tiling_.GetBaseK() * sizeof(SrcT) <= L0BSize_ / Impl::DB_FACTOR), {
            KERNEL_LOG(KERNEL_ERROR, "baseN * baseK is %d , which should be not larger than B2 Size / 2 when isA2B2Shared is enable %d",
                var.tiling_.GetBaseN() * var.tiling_.GetBaseK() * sizeof(SrcT), L0BSize_ / Impl::DB_FACTOR);
        });
    }
#endif

    if (var.tiling_.GetShareMode() == 1) {
        ASCENDC_ASSERT((var.tiling_.GetBaseM() * var.tiling_.GetBaseK() * sizeof(SrcT) <= L0ASize_ / HALF_FACTOR), {
            KERNEL_LOG(KERNEL_ERROR,
                "baseM is %d , baseK is %d, baseM * baseK should be less than half l0a when in mode 1",
                var.tiling_.GetBaseM(), var.tiling_.GetBaseK());
        });
        ASCENDC_ASSERT((var.tiling_.GetBaseN() * var.tiling_.GetBaseK() * sizeof(SrcT) <= L0BSize_ / HALF_FACTOR), {
            KERNEL_LOG(KERNEL_ERROR,
                "baseN is %d , baseK is %d, baseN * baseK should be less than half l0b when in mode 1",
                var.tiling_.GetBaseN(), var.tiling_.GetBaseK());
        });
        ASCENDC_ASSERT((var.tiling_.GetBaseM() * var.tiling_.GetBaseN() * sizeof(L0cT) <= L0CSize_ / HALF_FACTOR), {
            KERNEL_LOG(KERNEL_ERROR,
                "baseM is %d , baseN is %d, baseM * baseN should be less than half l0c when in mode 1",
                var.tiling_.GetBaseM(), var.tiling_.GetBaseN());
        });
    }
#if __CCE_AICORE__ >= 220
    if constexpr (DoMatmulMDL(MM_CFG) || DoMatmulSpecialMDL(MM_CFG)) {
        ASCENDC_ASSERT((var.tiling_.GetDepthA1() % (var.tiling_.GetStepM() * var.tiling_.GetStepKa()) == 0), {
            KERNEL_LOG(KERNEL_ERROR, "depthA1 is %d , which should be divided exactly by stepM * stepKa(%d * %d)",
                var.tiling_.GetDepthA1(), var.tiling_.GetStepM(), var.tiling_.GetStepKa());
        });
        ASCENDC_ASSERT((var.tiling_.GetDepthB1() % (var.tiling_.GetStepN() * var.tiling_.GetStepKb()) == 0), {
            KERNEL_LOG(KERNEL_ERROR, "depthB1 is %d , which should be divided exactly by stepN * stepKb(%d * %d)",
                var.tiling_.GetDepthB1(), var.tiling_.GetStepN(), var.tiling_.GetStepKb());
        });
        ASCENDC_ASSERT((var.tiling_.GetDepthA1() / (var.tiling_.GetStepM() * var.tiling_.GetStepKa()) <= 2), {
            KERNEL_LOG(KERNEL_ERROR, "depthA1 is %d , stepM %d, stepKa %d, depthA1 <= 2 * (stepM * stepKa)",
                var.tiling_.GetDepthA1(), var.tiling_.GetStepM(), var.tiling_.GetStepKa());
        });
        ASCENDC_ASSERT((var.tiling_.GetDepthB1() / (var.tiling_.GetStepN() * var.tiling_.GetStepKb()) <= 2), {
            KERNEL_LOG(KERNEL_ERROR, "depthB1 is %d , stepN %d, stepKb %d, depthB1 <= 2 * (stepN * stepKb)",
                var.tiling_.GetDepthB1(), var.tiling_.GetStepN(), var.tiling_.GetStepKb());
        });
    }
#endif
#endif
}

template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::InitStepMParams()
{
    // Derive the M-direction iteration count and tail block size from
    // singleCoreM and baseM; a zero remainder means the tail is a full base block.
    const auto baseM = var.tiling_.GetBaseM();
    if constexpr (ToMatmulConfig(MM_CFG).intraBlockPartSum) {
        if (var.subBlockIdx_ == 0) {
            var.mIter_ = Ceil(var.singleCoreM_, baseM);
            const auto remM = var.singleCoreM_ % baseM;
            var.tailM_ = (remM == 0) ? baseM : remM;
        } else {
            // Sub-block 1 keeps its own copy of the iteration parameters.
            intraBlockMatmul.mIter = Ceil(intraBlockMatmul.singleCoreM, baseM);
            const auto remM = intraBlockMatmul.singleCoreM % baseM;
            intraBlockMatmul.tailM = (remM == 0) ? baseM : remM;
        }
    } else {
        if constexpr (IsBasicM(MM_CFG)) {
            // Basic-block configs run exactly one M iteration.
            var.mIter_ = 1;
        } else {
            var.mIter_ = Ceil(var.singleCoreM_, baseM);
        }
        if constexpr (NoTailM(MM_CFG)) {
            var.tailM_ = baseM;
        } else {
            const auto remM = var.singleCoreM_ % baseM;
            var.tailM_ = (remM == 0) ? baseM : remM;
        }
    }
    // MDL variants additionally iterate over stepM-sized groups of base blocks.
    if constexpr (DoMatmulMDL(MM_CFG) || DoMatmulSpecialMDL(MM_CFG)) {
        const auto stepSizeM = baseM * var.tiling_.GetStepM();
        var.mStepIter_ = Ceil(var.singleCoreM_, stepSizeM);
        const auto remStepM = var.singleCoreM_ % stepSizeM;
        var.tailStepM_ = (remStepM == 0) ? stepSizeM : remStepM;
    }
}

template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::InitStepNParams()
{
    // Derive the N-direction iteration count and tail block size from
    // singleCoreN and baseN; a zero remainder means the tail is a full base block.
    const auto baseN = var.tiling_.GetBaseN();
    if constexpr (ToMatmulConfig(MM_CFG).intraBlockPartSum) {
        if (var.subBlockIdx_ == 0) {
            var.nIter_ = Ceil(var.singleCoreN_, baseN);
            const auto remN = var.singleCoreN_ % baseN;
            var.tailN_ = (remN == 0) ? baseN : remN;
        } else {
            // Sub-block 1 keeps its own copy of the iteration parameters.
            intraBlockMatmul.nIter = Ceil(intraBlockMatmul.singleCoreN, baseN);
            const auto remN = intraBlockMatmul.singleCoreN % baseN;
            intraBlockMatmul.tailN = (remN == 0) ? baseN : remN;
        }
    } else {
        if constexpr (IsBasicN(MM_CFG)) {
            // Basic-block configs run exactly one N iteration.
            var.nIter_ = 1;
        } else {
            var.nIter_ = Ceil(var.singleCoreN_, baseN);
        }
        if constexpr (NoTailN(MM_CFG)) {
            var.tailN_ = baseN;
        } else {
            const auto remN = var.singleCoreN_ % baseN;
            var.tailN_ = (remN == 0) ? baseN : remN;
        }
        // MDL variants additionally iterate over stepN-sized groups of base
        // blocks (only on the non-intraBlockPartSum path, unlike the M params).
        if constexpr (DoMatmulMDL(MM_CFG) || DoMatmulSpecialMDL(MM_CFG)) {
            const auto stepSizeN = baseN * var.tiling_.GetStepN();
            var.nStepIter_ = Ceil(var.singleCoreN_, stepSizeN);
            const auto remStepN = var.singleCoreN_ % stepSizeN;
            var.tailStepN_ = (remStepN == 0) ? stepSizeN : remStepN;
        }
    }
}

// Derives the K-direction iteration count and tail size from singleCoreK and
// baseK. For MDL variants it also computes the stepKa/stepKb group iteration
// counts, their tails, and whether the whole K axis fits into A1/B1 at once.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::InitStepKParams()
{
    if constexpr (ToMatmulConfig(MM_CFG).intraBlockPartSum) {
        if (var.subBlockIdx_ == 0) {
            var.kIter_ = Ceil(var.singleCoreK_, var.tiling_.GetBaseK());
            var.tailK_ = var.singleCoreK_ % var.tiling_.GetBaseK();
            if (var.tailK_ == 0) {
                var.tailK_ = var.tiling_.GetBaseK();
            }
        } else {
            // Sub-block 1 keeps its own copy of the iteration parameters.
            intraBlockMatmul.kIter = Ceil(intraBlockMatmul.singleCoreK, var.tiling_.GetBaseK());
            intraBlockMatmul.tailK = intraBlockMatmul.singleCoreK % var.tiling_.GetBaseK();
            if (intraBlockMatmul.tailK == 0) {
                intraBlockMatmul.tailK = var.tiling_.GetBaseK();
            }
        }
    } else {
        if constexpr (IsBasicK(MM_CFG)) {
            // Basic-block configs run exactly one K iteration.
            var.kIter_ = 1;
        } else {
            var.kIter_ = Ceil(var.singleCoreK_, var.tiling_.GetBaseK());
        }
        if constexpr (NoTailK(MM_CFG)) {
            var.tailK_ = var.tiling_.GetBaseK();
        } else {
            var.tailK_ = var.singleCoreK_ % var.tiling_.GetBaseK();
            if (var.tailK_ == 0) {
                var.tailK_ = var.tiling_.GetBaseK();
            }
        }
        if constexpr (DoMatmulMDL(MM_CFG) || DoMatmulSpecialMDL(MM_CFG)) {
            var.kaStepIter_ = Ceil(var.singleCoreK_, var.tiling_.GetBaseK() * var.tiling_.GetStepKa());
            var.kbStepIter_ = Ceil(var.singleCoreK_, var.tiling_.GetBaseK() * var.tiling_.GetStepKb());
            // One step count must divide the other so the A/B K loops stay aligned.
            // Fix: literal '%' in a printf-style format must be escaped as '%%';
            // the old message also passed each value twice for a garbled text.
            ASCENDC_ASSERT((var.kaStepIter_ % var.kbStepIter_ == 0 || var.kbStepIter_ % var.kaStepIter_ == 0), {
                KERNEL_LOG(KERNEL_ERROR,
                    "kaStepIter_ is %d, kbStepIter_ is %d, "
                    "(kaStepIter_ %% kbStepIter_) or (kbStepIter_ %% kaStepIter_) should be 0",
                    var.kaStepIter_, var.kbStepIter_);
            });
            // The K step loop runs as many iterations as the larger of the two counts.
            var.kStepIter_ = var.kaStepIter_ > var.kbStepIter_ ? var.kaStepIter_ : var.kbStepIter_;
            var.tailStepKa_ = var.singleCoreK_ % (var.tiling_.GetBaseK() * var.tiling_.GetStepKa());
            var.tailStepKb_ = var.singleCoreK_ % (var.tiling_.GetBaseK() * var.tiling_.GetStepKb());
            if (var.tailStepKa_ == 0) {
                var.tailStepKa_ = var.tiling_.GetBaseK() * var.tiling_.GetStepKa();
            }
            if (var.tailStepKb_ == 0) {
                var.tailStepKb_ = var.tiling_.GetBaseK() * var.tiling_.GetStepKb();
            }

            // K is fully resident in L1 when one step covers every base-K iteration.
            var.isA1KFullLoad_ = (var.tiling_.GetStepKa() >= var.kIter_);
            var.isB1KFullLoad_ = (var.tiling_.GetStepKb() >= var.kIter_);
        }
    }
}

// Top-level initialization entry point: validates platform-specific output
// constraints, then dispatches to the Init* routine matching the layout mode
// and the compile-time matmul version encoded in MM_CFG.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::Init(
    const TCubeTiling* __restrict cubeTiling, TPipe* tpipe)
{
#if __CCE_AICORE__ == 200
    // v200 requires 32B-aligned N for ND-format output.
    if (C_TYPE::format == CubeFormat::ND && (cubeTiling->N * sizeof(DstT) % ONE_BLK_SIZE != 0)) {
        ASCENDC_ASSERT(
            (false), { KERNEL_LOG(KERNEL_ERROR, "N dims need to be aligined to 32B when ND format output in v200."); });
    }
#endif
#if __CCE_AICORE__ == 220
    // Channel split on v220 is restricted to NZ-format float output.
    if constexpr (ToMatmulConfig(MM_CFG).isEnableChannelSplit)
    {
        ASCENDC_ASSERT(
            ((C_TYPE::format == CubeFormat::NZ) && IsSameType<DstT, float>::value), { KERNEL_LOG(KERNEL_ERROR, "ChannelSplit supports only NZ format and float data type output in v220."); });
    }
#endif
    // NOTE(review): the global pipe pointer is used for dispatch while the
    // `tpipe` argument is forwarded unchanged inside the callees -- confirm
    // whether both are expected to refer to the same pipe.
    auto tpipePtr = GetTPipePtr();
    if constexpr (A_TYPE::layout != LayoutMode::NONE) {
        // A batch layout selects the batch-matmul path.
        InitBatch(cubeTiling, tpipePtr);
    } else if constexpr (DoMatmulNorm(MM_CFG) || DoMatmulBasicBlock(MM_CFG) || DoMatmulSpecialBasicBlock(MM_CFG)) {
        if constexpr (isNormEnableScheduler<A_TYPE, MM_CFG> || IsBasicBlockEnable<MM_CFG> || IsIntrablock<MM_CFG>) {
            InitNormScheduler(cubeTiling, tpipePtr);
        } else {
            InitNorm(cubeTiling, tpipePtr);
        }
    } else if constexpr (DoMatmulMDL(MM_CFG) || DoMatmulSpecialMDL(MM_CFG)) {
#if __CCE_AICORE__ < 200
        // NOTE(review): guard is __CCE_AICORE__ < 200 but the message says
        // "valid only in v220" -- confirm whether v200 is also supported.
        ASCENDC_ASSERT((false), { KERNEL_LOG(KERNEL_ERROR, "MatmulVersion MULTI_DATA_LOAD is valid only in v220."); });
#endif
        InitMDL(cubeTiling, tpipePtr);
    } else if constexpr (DoMatmulIBShareNorm(MM_CFG)) {
        InitIBShareNorm(cubeTiling, tpipePtr);
    } else {
        ASCENDC_ASSERT((false), { KERNEL_LOG(KERNEL_ERROR, "Unsupported matmul version."); });
    }
    // Bias reuse (isBiasBatch == false) is only meaningful for batch matmul.
    if constexpr (A_TYPE::layout == LayoutMode::NONE && !ToMatmulConfig(MM_CFG).isBiasBatch) {
        ASCENDC_ASSERT(
            (false), { KERNEL_LOG(KERNEL_ERROR, "Bias reuse is only valid in BMM."); });
    }
}

// Initializes state for batch matmul (A layout != NONE): records tiling and
// shapes, initializes the loop/buffer modules, opens the shared-buffer region,
// and sets up batched copy-in plus the bias scheduler.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::InitBatch(
    const TCubeTiling* __restrict cubeTiling, TPipe* tpipe)
{
    ASCENDC_ASSERT(!DoMatmulMDL(MM_CFG), { KERNEL_LOG(KERNEL_ERROR, "BatchMatmul unsupport MDL."); });
    // A single batch that exceeds L1 falls back to the normal scheduler path.
    if constexpr (ToMatmulConfig(MM_CFG).batchMode == BatchMode::SINGLE_LARGE_THAN_L1) {
        if constexpr (!ToMatmulConfig(MM_CFG).isBiasBatch) {
            ASCENDC_ASSERT(false, { KERNEL_LOG(KERNEL_ERROR,
                "Bias reuse does not supported BatchMode::SINGLE_LARGE_THAN_L1");});
        }
        InitNormScheduler(cubeTiling, tpipe);
        return;
    }
    // Transpose and bias flags start disabled; callers enable them later.
    var.isTransposeA_ = false;
    var.isTransposeB_ = false;
    var.enableBias_ = false;
#if __CCE_AICORE__ < 220
    var.subBlockIdx_ = 0;
#endif
    var.tiling_.SetTiling(cubeTiling);
#if __CCE_AICORE__ == 220
    // OUTER_PRODUCT scheduling requires K to fit in one base block.
    if constexpr (ToMatmulConfig(MM_CFG).scheduleType == ScheduleType::OUTER_PRODUCT) {
        ASCENDC_ASSERT(var.tiling_.GetSingleCoreK() <= var.tiling_.GetBaseK(), { KERNEL_LOG(KERNEL_ERROR,
            "When singleCoreK is larger than baseK, the parameter scheduleType of MM_CFG should not be OUTER_PRODUCT");});
    }
#endif
    var.tpipe_ = tpipe;

    // Cache the global and single-core shapes from tiling.
    M_ = var.tiling_.GetM();
    N_ = var.tiling_.GetN();
    Ka_ = var.tiling_.GetKa();
    Kb_ = var.tiling_.GetKb();
    Kc_ = N_;
    var.singleCoreM_ = var.tiling_.GetSingleCoreM();
    var.singleCoreN_ = var.tiling_.GetSingleCoreN();
    var.singleCoreK_ = var.tiling_.GetSingleCoreK();

    MATMUL_MODULE(MLoop)->Init(var.singleCoreM_);
    MATMUL_MODULE(NLoop)->Init(var.singleCoreN_);
    MATMUL_MODULE(KLoop)->Init(var.singleCoreK_);
    MATMUL_MODULE(BatchLoop)->Init();
    // (DbL0A - 1) & (DbL0B - 1): presumably non-zero only when both L0A and
    // L0B use double buffering -- confirm against TBufPoolL0::Init semantics.
    MATMUL_MODULE(TBufPoolL0)->Init((MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetDbL0A() - 1) &
        (MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetDbL0B() - 1));
    CheckTiling();

    uint32_t shareUbSize = static_cast<uint32_t>(var.tiling_.GetShareUbSize());
#if __CCE_AICORE__ == 200
    // v200 does not share UB through this mechanism.
    shareUbSize = 0;
#endif
    // Shared buffer lengths: {L1, L0C, UB}.
    uint32_t shareLens[3] = {static_cast<uint32_t>(var.tiling_.GetShareL1Size()),
        static_cast<uint32_t>(var.tiling_.GetShareL0CSize()), shareUbSize};
    InitShareBufStart(var.tpipe_, var.tiling_.GetShareMode(), shareLens, 3, var.subBlockIdx_);

    MATMUL_MODULE(BatchCopyCubeInA)->Init();
    MATMUL_MODULE(BatchCopyCubeInB)->Init();
    // OUTER_PRODUCT on >= v220 doubles the output buffer length.
    uint32_t lenFactor = 1;
#if __CCE_AICORE__ >= 220
    if constexpr (ToMatmulConfig(MM_CFG).scheduleType == ScheduleType::OUTER_PRODUCT) {
        lenFactor = Impl::DOUBLE_SIZE;
    }
#endif

    MATMUL_MODULE(CubeOutBuffer)->Init(var.tiling_.GetBaseM() * var.tiling_.GetBaseN(), lenFactor);
    // Bias scheduler batch count depends on whether the batch fits in L1.
    if constexpr (ToMatmulConfig(MM_CFG).batchMode == BatchMode::BATCH_LARGE_THAN_L1) {
        MATMUL_MODULE(BiasScheduler)->Init(MATMUL_MODULE(BatchLoop)->GetBatchNum());
    } else if constexpr (ToMatmulConfig(MM_CFG).batchMode == BatchMode::BATCH_LESS_THAN_L1) {
        MATMUL_MODULE(BiasScheduler)->Init(MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBatchNum());
    }

    InitShareBufEnd(var.tpipe_);
}

// Initializes state for the scheduler-driven normal matmul path: records
// tiling and shapes (possibly compile-time constant for special basic blocks),
// initializes loop/buffer modules, shared buffers, copy-in, bias and quant.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::InitNormScheduler(
    const TCubeTiling* __restrict cubeTiling, TPipe* tpipe)
{
#if __CCE_AICORE__ < 220
    // when output is int8 and ND format, do not support on the fly trans nd2nz
    if constexpr (C_TYPE::format == CubeFormat::ND && !ToMatmulConfig(MM_CFG).enVecND2NZ &&
        (IsSameType<DstT, int8_t>::value || IsSameType<DstT, uint8_t>::value)) {
        ASCENDC_ASSERT(false, { KERNEL_LOG(KERNEL_ERROR,
            "When output's data format is ND and data type is int8_t or uint8_t,"
            " the parameter enVecND2NZ of MM_CFG should be true");});
    }
#endif
    // Transpose flags start disabled; callers enable them later.
    var.isTransposeA_ = false;
    var.isTransposeB_ = false;
#if __CCE_AICORE__ < 220 || __CCE_AICORE__ == 300
    var.subBlockIdx_ = 0;
#endif
    var.tiling_.SetTiling(cubeTiling);
    var.tpipe_ = tpipe;

    // Cache the global shapes from tiling.
    M_ = var.tiling_.GetM();
    N_ = var.tiling_.GetN();
    Ka_ = var.tiling_.GetKa();
    Kb_ = var.tiling_.GetKb();
    Kc_ = N_;
    // Special basic blocks may fix the single-core shape at compile time.
    if constexpr (DoMatmulSpecialBasicBlock(MM_CFG) && MM_CFG.singleCoreM != 0 && MM_CFG.singleCoreN != 0 &&
        MM_CFG.singleCoreK != 0) {
        var.singleCoreM_ = ToMatmulConfig(MM_CFG).singleCoreM;
        var.singleCoreN_ = ToMatmulConfig(MM_CFG).singleCoreN;
        var.singleCoreK_ = ToMatmulConfig(MM_CFG).singleCoreK;
    } else {
        var.singleCoreM_ = var.tiling_.GetSingleCoreM();
        var.singleCoreN_ = var.tiling_.GetSingleCoreN();
        var.singleCoreK_ = var.tiling_.GetSingleCoreK();
    }

    CheckTiling();

    MATMUL_MODULE(MLoop)->Init(var.singleCoreM_);
    MATMUL_MODULE(NLoop)->Init(var.singleCoreN_);
    MATMUL_MODULE(KLoop)->Init(var.singleCoreK_);
    // (DbL0A - 1) & (DbL0B - 1): presumably non-zero only when both L0A and
    // L0B use double buffering -- confirm against TBufPoolL0::Init semantics.
    MATMUL_MODULE(TBufPoolL0)->Init((MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetDbL0A() - 1) &
            (MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetDbL0B() - 1));
    if constexpr (unlikely(Impl::Detail::MatmulFeatureTrait<MM_CFG>::IsUnitFlagEnabled())) {
        SetMMLayoutTransform(0);
    }

    // Basic-block configs take base sizes from MM_CFG instead of tiling.
    if constexpr (IsBasicBlockEnable<MM_CFG>) {
        var.baseMN_ = MM_CFG.basicM * MM_CFG.basicN;
    } else {
        var.baseMN_ = var.tiling_.GetBaseM() * var.tiling_.GetBaseN();
    }

    uint32_t shareUbSize = static_cast<uint32_t>(var.tiling_.GetShareUbSize());
#if __CCE_AICORE__ == 200
    // v200 does not share UB through this mechanism.
    shareUbSize = 0;
#endif
    // Shared buffer lengths: {L1, L0C, UB}.
    uint32_t shareLens[3] = {static_cast<uint32_t>(var.tiling_.GetShareL1Size()),
        static_cast<uint32_t>(var.tiling_.GetShareL0CSize()), shareUbSize};
    InitShareBufStart(var.tpipe_, var.tiling_.GetShareMode(), shareLens, 3, var.subBlockIdx_);
    MATMUL_MODULE(CopyCubeInA)->Init();
    MATMUL_MODULE(CopyCubeInB)->Init();
    MATMUL_MODULE(CubeOutBuffer)->Init(var.baseMN_, 1);

#if __CCE_AICORE__ >= 220
    MATMUL_MODULE(BiasScheduler)->Init();
#endif
    // Enable the quant processor for the supported mixed-precision combos.
    if constexpr (((IsSameType<SrcT, int8_t>::value || IsSameType<SrcT, int4b_t>::value) &&
        IsSameType<DstT, half>::value) ||
        (IsSameType<SrcT, int8_t>::value && (IsSameType<DstT, int8_t>::value ||
        IsSameType<DstT, uint8_t>::value))
#if __CCE_AICORE__ == 220
        // only c220 support A16W16C8 in CFG_NORM
        || ((IsSameType<SrcT, half>::value || IsSameType<SrcT, bfloat16_t>::value) &&
        IsSameType<DstT, int8_t>::value)
#endif
        ) {
        MatmulQuantProcessor::Init(var.tiling_.GetBaseN());
    }

    InitShareBufEnd(var.tpipe_);
}

// Initializes state for the legacy (non-scheduler) normal matmul path:
// records tiling and shapes, derives the M/N/K step parameters, validates
// tiling, and sets up shared buffers, copy-in, bias and quant processing.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::InitNorm(
    const TCubeTiling* __restrict cubeTiling, TPipe* tpipe)
{
#if __CCE_AICORE__ < 220
    // when output is int8 and ND format, do not support on the fly trans nd2nz
    if constexpr (C_TYPE::format == CubeFormat::ND && !ToMatmulConfig(MM_CFG).enVecND2NZ &&
        (IsSameType<DstT, int8_t>::value || IsSameType<DstT, uint8_t>::value)) {
        ASCENDC_ASSERT(false, { KERNEL_LOG(KERNEL_ERROR,
            "When output's data format is ND and data type is int8_t or uint8_t,"
            " the parameter enVecND2NZ of MM_CFG should be true");});
    }
#endif
    // Transpose and bias flags start disabled; callers enable them later.
    var.isTransposeA_ = false;
    var.isTransposeB_ = false;
    var.enableBias_ = false;
#if __CCE_AICORE__ < 220 || __CCE_AICORE__ == 300
    var.subBlockIdx_ = 0;
#endif
    var.tiling_.SetTiling(cubeTiling);
    var.tpipe_ = tpipe;
#if __CCE_AICORE__ == 220 || __CCE_AICORE__ == 200 || __CCE_AICORE__ == 300
    MatmulInstr::Init();
#endif

    // Cache the global and single-core shapes from tiling.
    M_ = var.tiling_.GetM();
    N_ = var.tiling_.GetN();
    Ka_ = var.tiling_.GetKa();
    Kb_ = var.tiling_.GetKb();
    Kc_ = N_;
    var.singleCoreM_ = var.tiling_.GetSingleCoreM();
    var.singleCoreN_ = var.tiling_.GetSingleCoreN();
    var.singleCoreK_ = var.tiling_.GetSingleCoreK();
    // Intra-block partial sum keeps a second copy of the shapes for sub-block 1.
    if constexpr (ToMatmulConfig(MM_CFG).intraBlockPartSum) {
        intraBlockMatmul.singleCoreM = var.tiling_.GetSingleCoreM();
        intraBlockMatmul.singleCoreN = var.tiling_.GetSingleCoreN();
        intraBlockMatmul.singleCoreK = var.tiling_.GetSingleCoreK();
        intraBlockMatmul.enableBias = false;
    }

    if constexpr (DoMatmulBasicBlock(MM_CFG) || DoMatmulSpecialBasicBlock(MM_CFG)) {
        // Basic blocks always use the full base shape per iteration.
        var.baseUseM_ = var.tiling_.GetBaseM();
        var.baseUseN_ = var.tiling_.GetBaseN();
        var.blockUseM_ = var.baseUseM_ / BLOCK_CUBE;
        var.blockUseN_ = var.baseUseN_ / BLOCK_CUBE;

        ASCENDC_ASSERT((!(A_TYPE::format == CubeFormat::SCALAR || A_TYPE::format == CubeFormat::VECTOR) &&
            !(PhyPosIsL1(A_TYPE::pos) || PhyPosIsL1(B_TYPE::pos))),
                       { KERNEL_LOG(KERNEL_ERROR, "Currently basic block does not support GEMV and TSCM."); });
    }

    // Derive per-direction iteration counts and tails before validating tiling.
    InitStepMParams();
    InitStepNParams();
    InitStepKParams();

    CheckTiling();
    CheckIterSize();

    var.baseMN_ = var.tiling_.GetBaseM() * var.tiling_.GetBaseN();
    uint32_t shareUbSize = static_cast<uint32_t>(var.tiling_.GetShareUbSize());
#if __CCE_AICORE__ == 200
    // v200 does not share UB through this mechanism.
    shareUbSize = 0;
#endif
    // Shared buffer lengths: {L1, L0C, UB}.
    uint32_t shareLens[3] = {static_cast<uint32_t>(var.tiling_.GetShareL1Size()),
        static_cast<uint32_t>(var.tiling_.GetShareL0CSize()), shareUbSize};
    InitShareBufStart(var.tpipe_, var.tiling_.GetShareMode(), shareLens, 3, var.subBlockIdx_);
    MATMUL_MODULE(CopyCubeInA)->Init();
    MATMUL_MODULE(CopyCubeInB)->Init();
    MATMUL_MODULE(CubeOutBuffer)->Init(var.baseMN_, 1);

#if __CCE_AICORE__ >= 220
    if constexpr (ToMatmulConfig(MM_CFG).enableSetBias) {
        if (var.tiling_.IsBias()) {
            // OUTER_PRODUCT with ORDER_M double-buffers the bias queue.
            if constexpr (A_TYPE::layout == LayoutMode::NONE && ToMatmulConfig(MM_CFG).scheduleType == ScheduleType::OUTER_PRODUCT
                && ToMatmulConfig(MM_CFG).iterateOrder == IterateOrder::ORDER_M) {
                var.tpipe_->InitBuffer(var.qidBias_, 1, Impl::DOUBLE_SIZE * var.tiling_.GetBaseN() * sizeof(BiasT));
            } else {
                var.tpipe_->InitBuffer(var.qidBias_, 1, var.tiling_.GetBaseN() * sizeof(BiasT));
            }
        }
    }
#endif
    // Enable the quant processor for the supported mixed-precision combos.
    if constexpr (((IsSameType<SrcT, int8_t>::value || IsSameType<SrcT, int4b_t>::value) &&
        IsSameType<DstT, half>::value) ||
        (IsSameType<SrcT, int8_t>::value && (IsSameType<DstT, int8_t>::value ||
        IsSameType<DstT, uint8_t>::value))
#if __CCE_AICORE__ == 220
        // only c220 support A16W16C8 in CFG_NORM
        || ((IsSameType<SrcT, half>::value || IsSameType<SrcT, bfloat16_t>::value) &&
        IsSameType<DstT, int8_t>::value)
#endif
        ) {
        MatmulQuantProcessor::Init(var.tiling_.GetBaseN());
    }
#if (__CCE_AICORE__ < 200)
    // Older cores allocate the full L0A/L0B as single-buffer queues.
    var.tpipe_->InitBuffer(var.qidA2_, 1, L0ASize_);
    var.tpipe_->InitBuffer(var.qidB2_, 1, L0BSize_);
#endif

    InitShareBufEnd(var.tpipe_);
}

// Initializes state for the multi-data-load (MDL) matmul path: validates
// config/platform constraints, records tiling and shapes, initializes loop
// and buffer modules, preload cache factors, bias and quant processing.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::InitMDL(
    const TCubeTiling* __restrict cubeTiling, TPipe* tpipe)
{
    ASCENDC_ASSERT((tpipe != nullptr), { KERNEL_LOG(KERNEL_ERROR, "tpipe can not be nullptr"); });
#if __CCE_AICORE__ < 220
    // when output is int8 and ND format, do not support on the fly trans nd2nz
    if constexpr (C_TYPE::format == CubeFormat::ND && !ToMatmulConfig(MM_CFG).enVecND2NZ &&
        (IsSameType<DstT, int8_t>::value || IsSameType<DstT, uint8_t>::value)) {
        ASCENDC_ASSERT(false, { KERNEL_LOG(KERNEL_ERROR,
            "When output's data format is ND and data type is int8_t or uint8_t,"
            " the parameter enVecND2NZ of MM_CFG should be true");});
    }
#endif

#if __CCE_AICORE__ != 220
    // OUTER_PRODUCT scheduling is an A2 (v220)-only feature.
    if constexpr (ToMatmulConfig(MM_CFG).scheduleType == ScheduleType::OUTER_PRODUCT) {
        ASCENDC_ASSERT(false, { KERNEL_LOG(KERNEL_ERROR,
            "ScheduleType is OUTER_PRODUCT only supported on A2");});
    }
#endif

    // Transpose flags start disabled; callers enable them later.
    var.isTransposeA_ = false;
    var.isTransposeB_ = false;
#if __CCE_AICORE__ < 220
    var.subBlockIdx_ = 0;
#endif
    var.tiling_.SetTiling(cubeTiling);
    var.tpipe_ = tpipe;

    // Cache the global and single-core shapes from tiling.
    M_ = var.tiling_.GetM();
    N_ = var.tiling_.GetN();
    Ka_ = var.tiling_.GetKa();
    Kb_ = var.tiling_.GetKb();
    Kc_ = N_;
    var.singleCoreM_ = var.tiling_.GetSingleCoreM();
    var.singleCoreN_ = var.tiling_.GetSingleCoreN();
    var.singleCoreK_ = var.tiling_.GetSingleCoreK();

    CheckTiling();

    MATMUL_MODULE(MLoop)->Init(var.singleCoreM_);
    MATMUL_MODULE(NLoop)->Init(var.singleCoreN_);
    MATMUL_MODULE(KLoop)->Init(var.singleCoreK_);
    // (DbL0A - 1) & (DbL0B - 1): presumably non-zero only when both L0A and
    // L0B use double buffering -- confirm against TBufPoolL0::Init semantics.
    MATMUL_MODULE(TBufPoolL0)->Init((MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetDbL0A() - 1) &
            (MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetDbL0B() - 1));
    if constexpr (unlikely(Impl::Detail::MatmulFeatureTrait<MM_CFG>::IsUnitFlagEnabled())) {
        SetMMLayoutTransform(0);
    }
    // K is fully resident in L1 when one step covers every K iteration.
    var.isA1KFullLoad_ = (var.tiling_.GetStepKa() >= MATMUL_MODULE(KLoop)->GetTotalIter());
    var.isB1KFullLoad_ = (var.tiling_.GetStepKb() >= MATMUL_MODULE(KLoop)->GetTotalIter());

#if __CCE_AICORE__ == 220
    // OUTER_PRODUCT needs a fixed iterate order and step > 1 on the outer axis.
    if constexpr (ToMatmulConfig(MM_CFG).scheduleType == ScheduleType::OUTER_PRODUCT) {
        ASCENDC_ASSERT((ToMatmulConfig(MM_CFG).iterateOrder != IterateOrder::UNDEF), {KERNEL_LOG(KERNEL_ERROR,
            "When scheduleType is OUTER_PRODUCT, iterateOrder of MM_CFG should not be UNDEF.");});
    }
    if constexpr (ToMatmulConfig(MM_CFG).scheduleType == ScheduleType::OUTER_PRODUCT && ToMatmulConfig(MM_CFG).iterateOrder == IterateOrder::ORDER_M) {
        ASCENDC_ASSERT((var.tiling_.GetStepN() > 1), {KERNEL_LOG(KERNEL_ERROR,
            "When scheduleType is OUTER_PRODUCT and iterateOrder is ORDER_M, stepN should be larger than 1");});
    } else if constexpr (ToMatmulConfig(MM_CFG).scheduleType == ScheduleType::OUTER_PRODUCT && ToMatmulConfig(MM_CFG).iterateOrder == IterateOrder::ORDER_N) {
        ASCENDC_ASSERT((var.tiling_.GetStepM() > 1), {KERNEL_LOG(KERNEL_ERROR,
            "When scheduleType is OUTER_PRODUCT and iterateOrder is ORDER_N, stepM should be larger than 1");});
    }
#endif

    var.baseMN_ = var.tiling_.GetBaseM() * var.tiling_.GetBaseN();

    uint32_t shareUbSize = static_cast<uint32_t>(var.tiling_.GetShareUbSize());
#if __CCE_AICORE__ == 200
        // v200 does not share UB through this mechanism.
        shareUbSize = 0;
#endif
    // Shared buffer lengths: {L1, L0C, UB}.
    uint32_t shareLens[3] = {static_cast<uint32_t>(var.tiling_.GetShareL1Size()),
        static_cast<uint32_t>(var.tiling_.GetShareL0CSize()), shareUbSize};
    InitShareBufStart(var.tpipe_, var.tiling_.GetShareMode(), shareLens, 3, var.subBlockIdx_);
    MATMUL_MODULE(CopyCubeInA)->Init();
    MATMUL_MODULE(CopyCubeInB)->Init();
    // cacheA1Factor_/cacheB1Factor_ used within preload
    if constexpr (ToMatmulConfig(MM_CFG).doMTE2Preload > 0) {
        // (depth / stepGroupSize - 1) & 1: presumably selects whether the L1
        // cache can double-buffer a step group -- confirm with preload logic.
        uint32_t cacheA1Size = var.tiling_.GetStepM() * var.tiling_.GetStepKa();
        var.cacheA1Factor_ = (var.tiling_.GetDepthA1() / cacheA1Size - 1) & 1;
        uint32_t cacheB1Size = var.tiling_.GetStepN() * var.tiling_.GetStepKb();
        var.cacheB1Factor_ = (var.tiling_.GetDepthB1() / cacheB1Size - 1) & 1;
    }

    // OUTER_PRODUCT on >= v220 doubles the output buffer length.
    uint32_t lenFactor = 1;
#if __CCE_AICORE__ >= 220
    if constexpr (ToMatmulConfig(MM_CFG).scheduleType == ScheduleType::OUTER_PRODUCT) {
        lenFactor = Impl::DOUBLE_SIZE;
    }
#endif
    MATMUL_MODULE(CubeOutBuffer)->Init(var.baseMN_, lenFactor);

#if __CCE_AICORE__ > 220
    MATMUL_MODULE(BiasScheduler)->Init();
#endif

#if __CCE_AICORE__ == 220
    // v220 additionally supports int4/bf16 sources in the quant processor.
    MATMUL_MODULE(BiasScheduler)->Init();
    if constexpr (((IsSameType<SrcT, int8_t>::value || IsSameType<SrcT, int4b_t>::value) &&
        IsSameType<DstT, half>::value) ||
        ((IsSameType<SrcT, half>::value || IsSameType<SrcT, bfloat16_t>::value) && IsSameType<DstT, int8_t>::value) ||
        (IsSameType<SrcT, int8_t>::value && (IsSameType<DstT, int8_t>::value ||
        IsSameType<DstT, uint8_t>::value))) {
        MatmulQuantProcessor::Init(var.tiling_.GetBaseN());
    }
#else
    // Other cores enable the quant processor for the int8/half combos only.
    if constexpr ((IsSameType<SrcT, int8_t>::value && IsSameType<DstT, half>::value) ||
        (IsSameType<SrcT, half>::value && IsSameType<DstT, int8_t>::value) ||
        (IsSameType<SrcT, int8_t>::value && (IsSameType<DstT, int8_t>::value ||
        IsSameType<DstT, uint8_t>::value))) {
        MatmulQuantProcessor::Init(var.tiling_.GetBaseN());
    }
#endif
#if (__CCE_AICORE__ < 200)
    // Older cores allocate the full L0A/L0B as single-buffer queues.
    var.tpipe_->InitBuffer(var.qidA2_, 1, L0ASize_);
    var.tpipe_->InitBuffer(var.qidB2_, 1, L0BSize_);
#endif
    InitShareBufEnd(var.tpipe_);
}

// Initializes the matmul for the IBShare-Norm scheduling variant, where exactly one
// of A/B is shared between sub-blocks (ibShare) and must be sourced from GM.
// Caches tiling state, initializes loop/buffer modules inside the shared-buffer
// window, and sets up the quant processor for the supported quantized type pairs.
// \param cubeTiling tiling descriptor copied into the internal tiling holder
// \param tpipe pipe used for all buffer initialization in this call
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::InitIBShareNorm(
    const TCubeTiling* __restrict cubeTiling, TPipe* tpipe)
{
    // Transpose / HF32 state defaults to off; callers configure them via setters.
    var.isTransposeA_ = false;
    var.isTransposeB_ = false;
    var.enHF32Mode_ = false;
    var.hf32TransMode_ = 0;
    var.tiling_.SetTiling(cubeTiling);
    var.tpipe_ = tpipe;

    // Cache the original (full) matrix dimensions from tiling. Kc mirrors N here.
    M_ = var.tiling_.GetM();
    N_ = var.tiling_.GetN();
    Ka_ = var.tiling_.GetKa();
    Kb_ = var.tiling_.GetKb();
    Kc_ = N_;
    var.singleCoreM_ = var.tiling_.GetSingleCoreM();
    var.singleCoreN_ = var.tiling_.GetSingleCoreN();
    var.singleCoreK_ = var.tiling_.GetSingleCoreK();

    // Loop modules are only initialized for a non-degenerate single-core shape.
    if (var.singleCoreM_ > 0 && var.singleCoreN_ > 0 && var.singleCoreK_ > 0) {
        MATMUL_MODULE(MLoop)->Init(var.singleCoreM_);
        MATMUL_MODULE(NLoop)->Init(var.singleCoreN_);
        MATMUL_MODULE(KLoop)->Init(var.singleCoreK_);
    }

    // (db - 1) is 0 or 1, so the bitwise & yields 1 only when BOTH L0A and L0B
    // use double buffering. NOTE(review): looks intentional, confirm with tiling docs.
    MATMUL_MODULE(TBufPoolL0)
        ->Init((MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetDbL0A() - 1) &
               (MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetDbL0B() - 1));
    if constexpr (unlikely(Impl::Detail::MatmulFeatureTrait<MM_CFG>::IsUnitFlagEnabled())) {
        SetMMLayoutTransform(0);
    }

    CheckTiling();

    var.baseMN_ = var.tiling_.GetBaseM() * var.tiling_.GetBaseN();

    // Open the shared-buffer window: L1 / L0C / UB sizes from tiling, 3 entries.
    uint32_t shareUbSize = static_cast<uint32_t>(var.tiling_.GetShareUbSize());
    uint32_t shareLens[3] = {static_cast<uint32_t>(var.tiling_.GetShareL1Size()),
        static_cast<uint32_t>(var.tiling_.GetShareL0CSize()), shareUbSize};
    InitShareBufStart(var.tpipe_, var.tiling_.GetShareMode(), shareLens, 3, var.subBlockIdx_);

    // IBShare contract: exactly one of A/B is shared; the shared side must come
    // from GM and its K dimension must be fully loaded (depth covers all K iters).
    if constexpr (A_TYPE::ibShare) {
        ASCENDC_ASSERT((B_TYPE::ibShare == false), {
            KERNEL_LOG(KERNEL_ERROR, "When A is ibShare, B should not be ibShare");
        });
        ASCENDC_ASSERT((!PhyPosIsL1(A_TYPE::pos)), {
            KERNEL_LOG(KERNEL_ERROR, "When A is ibShare, A pos should be GM");
        });
        if (var.tiling_.GetDepthA1() < MATMUL_MODULE(KLoop)->GetTotalIter() * var.tiling_.GetStepM()) {
            // k not full load && var.tiling_.GetDepthA1() == 1
            ASCENDC_ASSERT((false), { KERNEL_LOG(KERNEL_ERROR, "Unsupported k not full load."); });
        }
    } else {
        ASCENDC_ASSERT((B_TYPE::ibShare == true),
                       { KERNEL_LOG(KERNEL_ERROR, "When A is not ibShare, B should be ibShare"); });
        ASCENDC_ASSERT((!PhyPosIsL1(B_TYPE::pos)),
                       { KERNEL_LOG(KERNEL_ERROR, "When B is ibShare, B pos should be GM"); });
        if (var.tiling_.GetDepthB1() < MATMUL_MODULE(KLoop)->GetTotalIter() * var.tiling_.GetStepN()) {
            // k not full load && var.tiling_.GetDepthB1() == 1
            ASCENDC_ASSERT((false), { KERNEL_LOG(KERNEL_ERROR, "Unsupported k not full load."); });
        }
    }

    // Input copy pipelines, single (non-double-buffered) output buffer, and bias.
    MATMUL_MODULE(CopyCubeInA)->Init();
    MATMUL_MODULE(CopyCubeInB)->Init();
    MATMUL_MODULE(CubeOutBuffer)->Init(var.baseMN_, 1);
    MATMUL_MODULE(BiasScheduler)->Init();
    // Quant processor is required only for the quantized src/dst combinations below.
    if constexpr (((IsSameType<SrcT, int8_t>::value || IsSameType<SrcT, int4b_t>::value) &&
        IsSameType<DstT, half>::value) ||
        (IsSameType<SrcT, int8_t>::value && (IsSameType<DstT, int8_t>::value ||
        IsSameType<DstT, uint8_t>::value))) {
        MatmulQuantProcessor::Init(var.tiling_.GetBaseN());
    }

    // Close the shared-buffer window opened by InitShareBufStart above.
    InitShareBufEnd(var.tpipe_);
}

// Sets the original (full) matrix shape with a single K dimension: Ka = Kb = orgK.
// Delegates to the four-argument overload; Kc takes that overload's default
// (declared outside this chunk).
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::SetOrgShape(
    int orgM, int orgN, int orgK)
{
    SetOrgShape(orgM, orgN, orgK, orgK);
}

// Sets the original (full) matrix dimensions. All four mandatory dimensions must
// be positive. With intraBlockPartSum enabled, sub-block 0 writes the primary
// shape members while the other sub-block records its shape in intraBlockMatmul.
// \param orgM/orgN full output dimensions
// \param orgKa/orgKb K dimension as seen by A and B respectively
// \param orgKc leading dimension for the C output (not validated here)
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::SetOrgShape(
    int orgM, int orgN, int orgKa, int orgKb, int orgKc)
{
    ASCENDC_ASSERT((orgM > 0), { KERNEL_LOG(KERNEL_ERROR, "orgM is %d , which should be larger than 0", orgM); });
    ASCENDC_ASSERT((orgN > 0), { KERNEL_LOG(KERNEL_ERROR, "orgN is %d , which should be larger than 0", orgN); });
    ASCENDC_ASSERT((orgKa > 0), { KERNEL_LOG(KERNEL_ERROR, "orgKa is %d , which should be larger than 0", orgKa); });
    ASCENDC_ASSERT((orgKb > 0), { KERNEL_LOG(KERNEL_ERROR, "orgKb is %d , which should be larger than 0", orgKb); });
    if constexpr(ToMatmulConfig(MM_CFG).intraBlockPartSum) {
        // Each sub-block keeps its own shape; sub-block 0 owns the primary members.
        if (var.subBlockIdx_ == 0) {
            M_ = orgM;
            N_ = orgN;
            Ka_ = orgKa;
            Kb_ = orgKb;
            Kc_ = orgKc;
        } else {
            intraBlockMatmul.M = orgM;
            intraBlockMatmul.N = orgN;
            intraBlockMatmul.Ka = orgKa;
            intraBlockMatmul.Kb = orgKb;
            intraBlockMatmul.Kc = orgKc;
        }
    } else {
        M_ = orgM;
        N_ = orgN;
        Ka_ = orgKa;
        Kb_ = orgKb;
        Kc_ = orgKc;
    }
    return;
}

// Sets the single-core shape for the next computation. Implemented on top of the
// tail-setting path: SetTail performs all validation and re-initializes the
// per-dimension loop modules.
// \param singleM/singleN/singleK single-core dimensions; -1 keeps the current value
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::SetSingleShape(
    int singleM, int singleN, int singleK)
{
    SetTail(singleM, singleN, singleK);
}

// Configures HF32 mode for the cube unit.
// \param enableHF32 true enables HF32 computation (mode 1), false disables (mode 0)
// \param transMode HF32 rounding/transform mode selector; only 0 or 1 is valid
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::SetHF32(bool enableHF32,
    int32_t transMode)
{
    ASCENDC_ASSERT((transMode == 0 || transMode == 1),
                   { KERNEL_LOG(KERNEL_ERROR, "transMode is %d , which should only be 0 / 1", transMode); });
    // Map the inputs directly onto the two mode registers; any transMode other
    // than 1 (i.e. 0, after the assert) selects transform mode 0.
    SetHF32Mode(enableHF32 ? 1 : 0);
    SetHF32TransMode(transMode == 1 ? 1 : 0);
}

// Records which sub-block (vector core pairing) this matmul instance serves.
// On AICORE 220 the index must be below MIX_NUM; other targets skip the check.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto &MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::SetSubBlockIdx(uint8_t subBlockIdx)
{
#if __CCE_AICORE__ == 220
    ASCENDC_ASSERT((subBlockIdx < MIX_NUM),
        { KERNEL_LOG(KERNEL_ERROR, "subBlockIdx is %d , which should only be [0,%d) ", subBlockIdx, MIX_NUM); });
#endif
    var.subBlockIdx_ = subBlockIdx;
}

// Releases all resources held by the matmul object. Dispatches at compile time to
// the teardown routine matching the configured scheduling variant; an unmatched
// configuration is a hard error.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::End()
{
    if constexpr (DoMatmulNorm(MM_CFG) || DoMatmulBasicBlock(MM_CFG) || DoMatmulSpecialBasicBlock(MM_CFG)) {
        EndNorm();
    } else if constexpr (DoMatmulMDL(MM_CFG) || DoMatmulSpecialMDL(MM_CFG)) {
        EndMDL();
    } else if constexpr (DoMatmulIBShareNorm(MM_CFG)) {
        EndIBShareNorm();
    } else {
        ASCENDC_ASSERT((false), { KERNEL_LOG(KERNEL_ERROR, "Unsupported matmul version."); });
    }
}

// Teardown for the Norm / BasicBlock scheduling variants: destroys input copy
// pipelines, releases bias events or scheduler state (target-dependent), clears
// L0 caches, frees the output buffer, and destroys the quant processor when one
// was created in Init.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::EndNorm()
{
    MATMUL_MODULE(ChosenCopyCubeInA)->Destroy();
    MATMUL_MODULE(ChosenCopyCubeInB)->Destroy();

#if __CCE_AICORE__ == 220
    // Scheduler-based paths own the bias/L0 state; legacy paths free events directly.
    if constexpr (isNormEnableScheduler<A_TYPE, MM_CFG> || IsBasicBlockEnable<MM_CFG> || IsIntrablock<MM_CFG>) {
        MATMUL_MODULE(BiasScheduler)->End();
        MATMUL_MODULE(TBufPoolL0)->ResetCache();
    } else {
        if constexpr (ToMatmulConfig(MM_CFG).enableSetBias) {
            if (var.tiling_.IsBias()) {
                var.qidBias_.FreeAllEvent();
            }
        }
        if constexpr (IsL0Cache<A_TYPE, MM_CFG>()) {
            MatmulInstr::ResetCache();
        }
    }
#else
    if constexpr (isNormEnableScheduler<A_TYPE, MM_CFG> || IsBasicBlockEnable<MM_CFG>) {
        MATMUL_MODULE(TBufPoolL0)->ResetCache();
    }
    if constexpr (isNormDisableScheduler<A_TYPE, MM_CFG>) {
        if constexpr (!ToMatmulConfig(MM_CFG).enVecND2NZ) {
            if (var.tiling_.IsBias()) {
                var.qidBias_.FreeAllEvent();
            }
        }
    }
#endif
    MATMUL_MODULE(CubeOutBuffer)->Destroy();

    // Mirrors the quant-init condition in Init; must match or the processor leaks.
    if constexpr (((IsSameType<SrcT, int8_t>::value || IsSameType<SrcT, int4b_t>::value) &&
        IsSameType<DstT, half>::value) ||
        (IsSameType<SrcT, int8_t>::value && (IsSameType<DstT, int8_t>::value ||
        IsSameType<DstT, uint8_t>::value))) {
        MatmulQuantProcessor::Destory();
    }
}

// Teardown for the MDL / SpecialMDL scheduling variants: destroys the input copy
// pipelines, ends the bias scheduler (220 only), clears the L0 buffer-pool cache,
// destroys the quant processor when one was created, and frees the output buffer.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::EndMDL()
{
    MATMUL_MODULE(CopyCubeInA)->Destroy();
    MATMUL_MODULE(CopyCubeInB)->Destroy();
#if __CCE_AICORE__ == 220
    MATMUL_MODULE(BiasScheduler)->End();
#endif
    MATMUL_MODULE(TBufPoolL0)->ResetCache();
    // Mirrors the quant-init condition in Init; must match or the processor leaks.
    if constexpr (((IsSameType<SrcT, int8_t>::value || IsSameType<SrcT, int4b_t>::value) &&
        IsSameType<DstT, half>::value) ||
        (IsSameType<SrcT, int8_t>::value && (IsSameType<DstT, int8_t>::value ||
        IsSameType<DstT, uint8_t>::value))) {
        MatmulQuantProcessor::Destory();
    }
    MATMUL_MODULE(CubeOutBuffer)->Destroy();
}

// Teardown for the IBShare-Norm scheduling variant: destroys the input copy
// pipelines, ends the bias scheduler, frees the output buffer, and destroys the
// quant processor when one was created in InitIBShareNorm.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::EndIBShareNorm()
{
    MATMUL_MODULE(CopyCubeInA)->Destroy();
    MATMUL_MODULE(CopyCubeInB)->Destroy();
    MATMUL_MODULE(BiasScheduler)->End();
    MATMUL_MODULE(CubeOutBuffer)->Destroy();
    // Mirrors the quant-init condition; must match or the processor leaks.
    if constexpr (((IsSameType<SrcT, int8_t>::value || IsSameType<SrcT, int4b_t>::value) &&
        IsSameType<DstT, half>::value) ||
        (IsSameType<SrcT, int8_t>::value && (IsSameType<DstT, int8_t>::value ||
        IsSameType<DstT, uint8_t>::value))) {
        MatmulQuantProcessor::Destory();
    }
}

// Updates the single-core (tail) shape for the next computation. A value of -1
// keeps the current dimension. Depending on the scheduling variant, the change is
// propagated either to the loop modules (MDL/scheduler paths) or to the legacy
// step-parameter recomputation (InitStep*Params). Also re-validates tiling and,
// for Basic Block mode, enforces base-dimension divisibility.
// \param tailM/tailN/tailK new single-core dimensions, or -1 to keep current
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::SetTail(
    int tailM, int tailN, int tailK)
{
    ASCENDC_ASSERT((tailM >= -1),
                   { KERNEL_LOG(KERNEL_ERROR, "tailM is %d , which should be not less than -1", tailM); });
    ASCENDC_ASSERT((tailN >= -1),
                   { KERNEL_LOG(KERNEL_ERROR, "tailN is %d , which should be not less than -1", tailN); });
    ASCENDC_ASSERT((tailK >= -1),
                   { KERNEL_LOG(KERNEL_ERROR, "tailK is %d , which should be not less than -1", tailK); });
    // IBShare additionally caps the tail at the tiling's single-core shape.
    if constexpr (DoMatmulIBShareNorm(MM_CFG)) {
        ASCENDC_ASSERT((var.tiling_.GetSingleCoreM() >= tailM),
                    { KERNEL_LOG(KERNEL_ERROR, "tailM is %d , which should be not more than singleCoreM_", tailM); });
        ASCENDC_ASSERT((var.tiling_.GetSingleCoreN() >= tailN),
                    { KERNEL_LOG(KERNEL_ERROR, "tailN is %d , which should be not more than singleCoreN_", tailN); });
        ASCENDC_ASSERT((var.tiling_.GetSingleCoreK() >= tailK),
                    { KERNEL_LOG(KERNEL_ERROR, "tailK is %d , which should be not more than singleCoreK_", tailK); });
    }
    if constexpr (ToMatmulConfig(MM_CFG).intraBlockPartSum) {
        // Each sub-block tracks its own single-core shape (see SetOrgShape).
        if (var.subBlockIdx_ == 0) {
            var.singleCoreM_ = (tailM != -1) ? tailM : var.singleCoreM_;
            var.singleCoreN_ = (tailN != -1) ? tailN : var.singleCoreN_;
            var.singleCoreK_ = (tailK != -1) ? tailK : var.singleCoreK_;
            MATMUL_MODULE(MLoop)->SetSingleShape(var.singleCoreM_);
            MATMUL_MODULE(NLoop)->SetSingleShape(var.singleCoreN_);
            MATMUL_MODULE(KLoop)->SetSingleShape(var.singleCoreK_);
        } else {
            intraBlockMatmul.singleCoreM = (tailM != -1) ? tailM : intraBlockMatmul.singleCoreM;
            intraBlockMatmul.singleCoreN = (tailN != -1) ? tailN : intraBlockMatmul.singleCoreN;
            intraBlockMatmul.singleCoreK = (tailK != -1) ? tailK : intraBlockMatmul.singleCoreK;
            MATMUL_MODULE(MLoop)->SetSingleShape(intraBlockMatmul.singleCoreM);
            MATMUL_MODULE(NLoop)->SetSingleShape(intraBlockMatmul.singleCoreN);
            MATMUL_MODULE(KLoop)->SetSingleShape(intraBlockMatmul.singleCoreK);
        }
    } else {
        // Per dimension: only act on a real change; scheduler-based variants update
        // the loop module, legacy variants recompute derived step parameters.
        if ((tailM != -1) && (tailM != var.singleCoreM_)) {
            var.singleCoreM_ = tailM;
            if constexpr (DoMatmulMDL(MM_CFG) || isNormEnableScheduler<A_TYPE, MM_CFG> ||
                          IsBmmEnableScheduler<A_TYPE, MM_CFG> || DoMatmulSpecialMDL(MM_CFG) ||
                          IsBasicBlockEnable<MM_CFG> || DoMatmulIBShareNorm(MM_CFG)) {
                MATMUL_MODULE(MLoop)->SetSingleShape(var.singleCoreM_);
            } else {
                InitStepMParams();
            }
        }
        if ((tailN != -1) && (tailN != var.singleCoreN_)) {
            var.singleCoreN_ = tailN;
            if constexpr (DoMatmulMDL(MM_CFG) || isNormEnableScheduler<A_TYPE, MM_CFG> ||
                          IsBmmEnableScheduler<A_TYPE, MM_CFG> || DoMatmulSpecialMDL(MM_CFG) ||
                          IsBasicBlockEnable<MM_CFG> || DoMatmulIBShareNorm(MM_CFG)) {
                MATMUL_MODULE(NLoop)->SetSingleShape(var.singleCoreN_);
            } else {
                InitStepNParams();
            }
        }
        if ((tailK != -1) && (tailK != var.singleCoreK_)) {
            var.singleCoreK_ = tailK;
            if constexpr (DoMatmulMDL(MM_CFG) || DoMatmulSpecialMDL(MM_CFG)) {
                MATMUL_MODULE(KLoop)->SetSingleShape(var.singleCoreK_);
                // MDL also re-derives whether a full K load fits in one step.
                var.isA1KFullLoad_ = (var.tiling_.GetStepKa() >= MATMUL_MODULE(KLoop)->GetTotalIter());
                var.isB1KFullLoad_ = (var.tiling_.GetStepKb() >= MATMUL_MODULE(KLoop)->GetTotalIter());
            } else if constexpr(isNormEnableScheduler<A_TYPE, MM_CFG> || IsBmmEnableScheduler<A_TYPE, MM_CFG> ||
                IsBasicBlockEnable<MM_CFG> || DoMatmulIBShareNorm(MM_CFG)) {
                MATMUL_MODULE(KLoop)->SetSingleShape(var.singleCoreK_);
            } else {
                InitStepKParams();
            }
        }
    }

    // Basic Block mode requires single-core dims to be multiples of the basic block
    // (M is exempt for vector-format A).
    if constexpr (DoMatmulBasicBlock(MM_CFG) || DoMatmulSpecialBasicBlock(MM_CFG)) {
        if constexpr (A_TYPE::format != CubeFormat::VECTOR) {
            ASCENDC_ASSERT((var.singleCoreM_ % ToMatmulConfig(MM_CFG).basicM == 0), {
                KERNEL_LOG(KERNEL_ERROR,
                    "singleCoreM is %d, basicM is %d, singleCoreM sould be a multiple of basicM in Basic Block mode.",
                    var.singleCoreM_, ToMatmulConfig(MM_CFG).basicM);
            });
        }
        ASCENDC_ASSERT((var.singleCoreN_ % ToMatmulConfig(MM_CFG).basicN == 0), {
            KERNEL_LOG(KERNEL_ERROR,
                "singleCoreN is %d, basicN is %d, singleCoreN sould be a multiple of basicN in Basic Block mode.",
                var.singleCoreN_, ToMatmulConfig(MM_CFG).basicN);
        });
    }

    CheckTiling();
    // Legacy (non-scheduler) variants keep iteration counts in var.*Iter_; verify
    // they are still valid after the shape change.
    if constexpr (DoMatmulMDL(MM_CFG) || isNormEnableScheduler<A_TYPE, MM_CFG> ||
                  IsBmmEnableScheduler<A_TYPE, MM_CFG> || DoMatmulSpecialMDL(MM_CFG) || IsBasicBlockEnable<MM_CFG> ||
                  DoMatmulIBShareNorm(MM_CFG) || IsIntrablock<MM_CFG>) {
        return;
    } else {
        ASCENDC_ASSERT((var.mIter_ > 0), {
            KERNEL_LOG(KERNEL_ERROR, "invalid singleCoreM or baseM, mIter_ is %d , which should be larger than 0",
                var.mIter_);
        });
        ASCENDC_ASSERT((var.nIter_ > 0), {
            KERNEL_LOG(KERNEL_ERROR, "invalid singleCoreN or baseN, nIter_ is %d , which should be larger than 0",
                var.nIter_);
        });
        ASCENDC_ASSERT((var.kIter_ > 0), {
            KERNEL_LOG(KERNEL_ERROR, "invalid singleCoreK or baseK, kIter_ is %d , which should be larger than 0",
                var.kIter_);
        });
        return;
    }
}

// Binds matrix A from global memory and resets the iteration state so the next
// Iterate starts from the beginning.
// \param gm global tensor holding matrix A
// \param isTransposeA whether A is consumed transposed; must be allowed by A_TYPE::isTrans
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::SetTensorA(
    const GlobalTensor<SrcAT>& gm, bool isTransposeA)
{
    ASCENDC_ASSERT((isTransposeA <= A_TYPE::isTrans), {
        KERNEL_LOG(KERNEL_ERROR, "It is not allowed to do A transpose when matmul A transpose is not defined.");
    });
    // Target-specific transpose restrictions: int4 on 220, int8 on 200.
#if __CCE_AICORE__ == 220
    if constexpr (IsSameType<SrcT, int4b_t>::value) {
        ASCENDC_ASSERT(!isTransposeA, { KERNEL_LOG(KERNEL_ERROR,
            "When matrix A DType is int4, matrix A should not be transposed");});
    }
#elif __CCE_AICORE__ == 200
    if constexpr (IsSameType<SrcT, int8_t>::value) {
        ASCENDC_ASSERT(!isTransposeA, { KERNEL_LOG(KERNEL_ERROR,
            "When matrix A DType is int8, matrix A should not be transposed");});
    }
#endif
    MATMUL_MODULE(ChosenCopyCubeInA)->SetInput(gm, isTransposeA);
    // Reset iteration state according to the scheduling variant.
    if constexpr (DoMatmulMDL(MM_CFG) || DoMatmulSpecialMDL(MM_CFG)) {
        MATMUL_MODULE(Scheduler)->Reset();
    } else if constexpr (DoMatmulNorm(MM_CFG) || IsBasicBlockEnable<MM_CFG> || DoMatmulIBShareNorm(MM_CFG)) {
        if constexpr (isNormEnableScheduler<A_TYPE, MM_CFG> || IsBasicBlockEnable<MM_CFG> ||
                      DoMatmulIBShareNorm(MM_CFG) || IsIntrablock<MM_CFG>) {
            MATMUL_MODULE(Scheduler)->Reset();
        } else if constexpr (!IsBmmEnableScheduler<A_TYPE, MM_CFG>) {
            IterateController::Reset();
        }
    } else {
        var.isFirstIter_ = true;
    }
}

// Binds matrix A from local (on-chip) memory and resets the iteration state so
// the next Iterate starts from the beginning. A local source is rejected for the
// ibShare side of an IBShare configuration, which must be fed from GM.
// \param leftMatrix local tensor holding matrix A
// \param isTransposeA whether A is consumed transposed; must be allowed by A_TYPE::isTrans
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::SetTensorA(
    const LocalTensor<SrcAT>& leftMatrix, bool isTransposeA)
{
    ASCENDC_ASSERT((isTransposeA <= A_TYPE::isTrans), {
        KERNEL_LOG(KERNEL_ERROR, "It is not allowed to do A transpose when matmul A transpose is not defined.");
    });
    // Target-specific transpose restrictions: int4 on 220, int8 on 200.
#if __CCE_AICORE__ == 220
    if constexpr (IsSameType<SrcT, int4b_t>::value) {
        ASCENDC_ASSERT(!isTransposeA, { KERNEL_LOG(KERNEL_ERROR,
            "When matrix A DType is int4, matrix A should not be transposed");});
    }
#elif __CCE_AICORE__ == 200
    if constexpr (IsSameType<SrcT, int8_t>::value) {
        ASCENDC_ASSERT(!isTransposeA, { KERNEL_LOG(KERNEL_ERROR,
            "When matrix A DType is int8, matrix A should not be transposed");});
    }
#endif
    // A/B does not come from GM with IBShare is not support
    if constexpr (DoMatmulIBShareNorm(MM_CFG) && A_TYPE::ibShare) {
        ASCENDC_ASSERT((false), {
            KERNEL_LOG(KERNEL_ERROR, "It is not allowed to do A whose src::pos is L1 when matmul A is ibShare.");
        });
    }
    MATMUL_MODULE(CopyCubeInA)->SetInput(leftMatrix, isTransposeA);
    // Reset iteration state according to the scheduling variant.
    if constexpr (DoMatmulMDL(MM_CFG) || DoMatmulSpecialMDL(MM_CFG)) {
        MATMUL_MODULE(Scheduler)->Reset();
    // Fix: include DoMatmulIBShareNorm here so an IBShare configuration whose
    // non-shared side A is fed from local memory still resets the Scheduler —
    // consistent with the GM overload of SetTensorA and both SetTensorB overloads,
    // which all list DoMatmulIBShareNorm in this outer dispatch.
    } else if constexpr (DoMatmulNorm(MM_CFG) || IsBasicBlockEnable<MM_CFG> || DoMatmulIBShareNorm(MM_CFG)) {
        if constexpr (isNormEnableScheduler<A_TYPE, MM_CFG> || IsBasicBlockEnable<MM_CFG> ||
                      DoMatmulIBShareNorm(MM_CFG) || IsIntrablock<MM_CFG>) {
            MATMUL_MODULE(Scheduler)->Reset();
        } else if constexpr (!IsBmmEnableScheduler<A_TYPE, MM_CFG>) {
            IterateController::Reset();
        }
    } else {
        var.isFirstIter_ = true;
    }
}

#if __CCE_AICORE__ >= 220
// Binds a scalar value that is broadcast as matrix A. Not allowed when A is the
// ibShare side of an IBShare configuration.
// \param aScalar scalar used in place of a full A matrix
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::SetTensorA(SrcAT aScalar)
{
    // A/B does not come from GM with IBShare is not support
    if constexpr (DoMatmulIBShareNorm(MM_CFG) && A_TYPE::ibShare) {
        ASCENDC_ASSERT((false), {
            KERNEL_LOG(KERNEL_ERROR, "It is not allowed to do A in scaler scene when matmul A is ibShare.");
        });
    }
    // Scheduler-based variants route the scalar through LoadToA2; the legacy
    // instruction path stores it directly on MatmulInstr.
    if constexpr (DoMatmulMDL(MM_CFG) || isNormEnableScheduler<A_TYPE, MM_CFG> ||
                  IsBmmEnableScheduler<A_TYPE, MM_CFG> || DoMatmulSpecialMDL(MM_CFG) || IsBasicBlockEnable<MM_CFG> ||
                  DoMatmulIBShareNorm(MM_CFG) || IsIntrablock<MM_CFG>) {
        MATMUL_MODULE(LoadToA2)->SetScalar(aScalar);
    } else {
        MatmulInstr::aScalar_ = aScalar;
    }
}

// Binds a scalar value that is broadcast as matrix B. Not allowed when B is the
// ibShare side of an IBShare configuration.
// NOTE(review): only the isNormDisableScheduler path stores the scalar — other
// variants silently ignore it here; confirm scheduler variants handle B scalars elsewhere.
// \param bScalar scalar used in place of a full B matrix
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::SetTensorB(SrcBT bScalar)
{
    // A/B does not come from GM with IBShare is not support
    if constexpr (DoMatmulIBShareNorm(MM_CFG) && B_TYPE::ibShare) {
        ASCENDC_ASSERT((false), {
            KERNEL_LOG(KERNEL_ERROR, "It is not allowed to do B in scaler scene when matmul B is ibShare.");
        });
    }
    if constexpr (isNormDisableScheduler<A_TYPE, MM_CFG>) {
        MatmulInstr::bScalar_ = bScalar;
    }
}
#endif

// Copies matrix A from local memory out to GM, then binds the GM copy as the A
// input. Only available on pre-220 targets.
// \param gm destination GM tensor that becomes the bound A input
// \param leftMatrix local source tensor for matrix A
// \param isTransposeA forwarded to SetTensorA
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::SetTensorAWithCopy(
    const GlobalTensor<SrcAT>& gm, const LocalTensor<SrcAT>& leftMatrix, bool isTransposeA)
{
#if (__CCE_AICORE__ < 220)
    // Ensure any pending vector writes to leftMatrix complete before the MTE3 copy.
    event_t eventIDVToMte3 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3));
    SetFlag<HardEvent::V_MTE3>(eventIDVToMte3);
    WaitFlag<HardEvent::V_MTE3>(eventIDVToMte3);
    struct DataCopyParams param;
    // Whole tensor as one burst, expressed in C0-sized blocks.
    // NOTE(review): uses sizeof(SrcT) while the B-side twin uses sizeof(SrcBT) —
    // presumably SrcT aliases A's source type; confirm.
    param.blockLen = leftMatrix.GetSize() / AscendCUtils::GetC0Count(sizeof(SrcT));
    DataCopy(gm, leftMatrix, param);
    SetTensorA(gm, isTransposeA);
#else
    ASCENDC_ASSERT((false), { KERNEL_LOG(KERNEL_ERROR, "not supported on Ascend910B1."); });
#endif
}

// Copies matrix B from local memory out to GM, then binds the GM copy as the B
// input. Only available on pre-220 targets.
// \param gm destination GM tensor that becomes the bound B input
// \param rightMatrix local source tensor for matrix B
// \param isTransposeB forwarded to SetTensorB
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::SetTensorBWithCopy(
    const GlobalTensor<SrcBT>& gm, const LocalTensor<SrcBT>& rightMatrix, bool isTransposeB)
{
#if (__CCE_AICORE__ < 220)
    // Ensure any pending vector writes to rightMatrix complete before the MTE3 copy.
    event_t eventIDVToMte3 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3));
    SetFlag<HardEvent::V_MTE3>(eventIDVToMte3);
    WaitFlag<HardEvent::V_MTE3>(eventIDVToMte3);
    struct DataCopyParams param;
    // Whole tensor as one burst, expressed in C0-sized blocks.
    param.blockLen = rightMatrix.GetSize() / AscendCUtils::GetC0Count(sizeof(SrcBT));
    DataCopy(gm, rightMatrix, param);
    SetTensorB(gm, isTransposeB);
#else
    ASCENDC_ASSERT((false), { KERNEL_LOG(KERNEL_ERROR, "not supported on Ascend910B1."); });
#endif
}


// Binds matrix B from global memory and resets the iteration state so the next
// Iterate starts from the beginning.
// \param gm global tensor holding matrix B
// \param isTransposeB whether B is consumed transposed; must be allowed by B_TYPE::isTrans
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::SetTensorB(
    const GlobalTensor<SrcBT>& gm, bool isTransposeB)
{
    ASCENDC_ASSERT((isTransposeB <= B_TYPE::isTrans), {
        KERNEL_LOG(KERNEL_ERROR, "It is not allowed to do B transpose when matmul B transpose is not defined.");
    });

    MATMUL_MODULE(ChosenCopyCubeInB)->SetInput(gm, isTransposeB);

    // Reset iteration state according to the scheduling variant.
    if constexpr (DoMatmulMDL(MM_CFG) || DoMatmulSpecialMDL(MM_CFG)) {
        MATMUL_MODULE(Scheduler)->Reset();
    } else if constexpr (DoMatmulNorm(MM_CFG) || IsBasicBlockEnable<MM_CFG> || DoMatmulIBShareNorm(MM_CFG)) {
        if constexpr (isNormEnableScheduler<A_TYPE, MM_CFG> || IsBasicBlockEnable<MM_CFG> ||
                      DoMatmulIBShareNorm(MM_CFG) || IsIntrablock<MM_CFG>) {
            MATMUL_MODULE(Scheduler)->Reset();
        } else if constexpr (!IsBmmEnableScheduler<A_TYPE, MM_CFG>) {
            IterateController::Reset();
        }
    } else {
        var.isFirstIter_ = true;
    }
}

// Binds matrix B from local (on-chip) memory and resets the iteration state so
// the next Iterate starts from the beginning. A local source is rejected for the
// ibShare side of an IBShare configuration, which must be fed from GM.
// \param rightMatrix local tensor holding matrix B
// \param isTransposeB whether B is consumed transposed; must be allowed by B_TYPE::isTrans
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::SetTensorB(
    const LocalTensor<SrcBT>& rightMatrix, bool isTransposeB)
{
    ASCENDC_ASSERT((isTransposeB <= B_TYPE::isTrans), {
        KERNEL_LOG(KERNEL_ERROR, "It is not allowed to do B transpose when matmul B transpose is not defined.");
    });
    // A/B does not come from GM with IBShare is not support
    if constexpr (DoMatmulIBShareNorm(MM_CFG) && B_TYPE::ibShare) {
        ASCENDC_ASSERT((false), {
            KERNEL_LOG(KERNEL_ERROR, "It is not allowed to do B whose src::pos is L1 when matmul B is ibShare.");
        });
    }
    MATMUL_MODULE(CopyCubeInB)->SetInput(rightMatrix, isTransposeB);
    // Reset iteration state according to the scheduling variant.
    if constexpr (DoMatmulMDL(MM_CFG) || DoMatmulSpecialMDL(MM_CFG)) {
        MATMUL_MODULE(Scheduler)->Reset();
    } else if constexpr (DoMatmulNorm(MM_CFG) || IsBasicBlockEnable<MM_CFG> || DoMatmulIBShareNorm(MM_CFG)) {
        if constexpr (isNormEnableScheduler<A_TYPE, MM_CFG> || IsBasicBlockEnable<MM_CFG> ||
                      DoMatmulIBShareNorm(MM_CFG) || IsIntrablock<MM_CFG>) {
            MATMUL_MODULE(Scheduler)->Reset();
        } else if constexpr (!IsBmmEnableScheduler<A_TYPE, MM_CFG>) {
            IterateController::Reset();
        }
    } else {
        var.isFirstIter_ = true;
    }
}

// Binds the bias vector from global memory and enables bias accumulation.
// Requires the tiling to declare bias usage. Scheduler-based variants route the
// bias through BiasScheduler; the legacy path stores the address directly.
// \param biasGlobal global tensor holding the bias vector
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
          MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::SetBias(
    const GlobalTensor<BiasT>& biasGlobal)
{
    ASCENDC_ASSERT((var.tiling_.IsBias()), {
            KERNEL_LOG(KERNEL_ERROR, "var.tiling_.IsBias() is %d, which should be true when SetBias.", var.tiling_.IsBias());
        });
    if constexpr (DoMatmulMDL(MM_CFG) || isNormEnableScheduler<A_TYPE, MM_CFG> ||
                  DoMatmulSpecialMDL(MM_CFG) || IsBasicBlockEnable<MM_CFG> || DoMatmulIBShareNorm(MM_CFG) ||
                  IsIntrablock<MM_CFG>) {
        MATMUL_MODULE(BiasScheduler)->SetInput(biasGlobal);
        MATMUL_MODULE(BiasScheduler)->SetBias(true);
        MATMUL_MODULE(Scheduler)->Reset();
    } else if constexpr (IsBmmEnableScheduler<A_TYPE, MM_CFG>) {
        // Batch matmul scheduler: no per-call Scheduler reset here.
        MATMUL_MODULE(BiasScheduler)->SetInput(biasGlobal);
        MATMUL_MODULE(BiasScheduler)->SetBias(true);
    } else {
        var.biasGlobal_ = biasGlobal.address_;
        var.enableBias_ = true;
        if constexpr (DoMatmulNorm(MM_CFG)) {
            IterateController::Reset();
        } else {
            var.isFirstIter_ = true;
        }
    }
}

// Binds the bias vector from local (on-chip) memory and enables bias accumulation.
// Requires the tiling to declare bias usage; mirrors the GM overload above.
// \param inputBias local tensor holding the bias vector
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
          MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::SetBias(
    const LocalTensor<BiasT>& inputBias)
{
    ASCENDC_ASSERT((var.tiling_.IsBias()), { KERNEL_LOG(KERNEL_ERROR,
            "var.tiling_.IsBias() is %d, which should be true when SetBias.", var.tiling_.IsBias()); });
    if constexpr (DoMatmulMDL(MM_CFG) || isNormEnableScheduler<A_TYPE, MM_CFG> ||
                  DoMatmulSpecialMDL(MM_CFG) || IsBasicBlockEnable<MM_CFG> || DoMatmulIBShareNorm(MM_CFG) ||
                  IsIntrablock<MM_CFG>) {
        MATMUL_MODULE(BiasScheduler)->SetInput(inputBias);
        MATMUL_MODULE(BiasScheduler)->SetBias(true);
        MATMUL_MODULE(Scheduler)->Reset();
    } else if constexpr (IsBmmEnableScheduler<A_TYPE, MM_CFG>) {
        // Batch matmul scheduler: no per-call Scheduler reset here.
        MATMUL_MODULE(BiasScheduler)->SetInput(inputBias);
        MATMUL_MODULE(BiasScheduler)->SetBias(true);
    } else {
        var.inputBias_.address_ = inputBias.address_;
        var.enableBias_ = true;
        if constexpr (DoMatmulNorm(MM_CFG)) {
            IterateController::Reset();
        } else {
            var.isFirstIter_ = true;
        }
    }
}

// Sets the batch counts for batched matmul; forwarded to the BatchLoop module.
// \param batchA number of batches on the A side
// \param batchB number of batches on the B side
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::SetBatchNum(int32_t batchA,
    int32_t batchB)
{
    MATMUL_MODULE(BatchLoop)->SetBatchNum(batchA, batchB);
}

// Turns bias accumulation off. Scheduler-based variants clear the flag on the
// BiasScheduler module; the legacy path clears the member flag directly.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::DisableBias()
{
    if constexpr (DoMatmulMDL(MM_CFG) || isNormEnableScheduler<A_TYPE, MM_CFG> ||
                  IsBmmEnableScheduler<A_TYPE, MM_CFG> || DoMatmulSpecialMDL(MM_CFG) || IsBasicBlockEnable<MM_CFG> ||
                  DoMatmulIBShareNorm(MM_CFG) || IsIntrablock<MM_CFG>) {
        MATMUL_MODULE(BiasScheduler)->SetBias(false);
    } else {
        var.enableBias_ = false;
    }
}

// Deprecated-style alias kept for API compatibility: identical to DisableBias().
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::ClearBias()
{
    DisableBias();
}

// Debug hook for querying the current C offset. The debug variant is not
// implemented: instantiating with isTurnOnDebug == true is a compile-time error.
// \return a value-initialized MatrixOffset in the (non-debug) supported case
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
template <bool isTurnOnDebug>
__aicore__ inline MatrixOffset MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::GetOffsetC()
{
    if constexpr (isTurnOnDebug) {
        static_assert(!isTurnOnDebug, "unsupported!");
    }
    // Fix: the original fell off the end of a non-void function when
    // isTurnOnDebug == false (the if-constexpr body is discarded), which is UB.
    return {};
}

#if __CCE_AICORE__ < 220
// v100, v200
// (v100/v200) Copies the current result block into a local output tensor.
// OUTER_PRODUCT scheduling is rejected at compile time on these targets.
// \param co2Local destination local tensor
// \param enAtomic atomic-write mode forwarded to the implementation
// \param enSequentialWrite whether the result is written sequentially
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
template <bool sync>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::GetTensorC(
    const LocalTensor<DstT>& co2Local, uint8_t enAtomic, bool enSequentialWrite)
{
    static_assert(ToMatmulConfig(MM_CFG).scheduleType != ScheduleType::OUTER_PRODUCT, "Unsupported scheduleType");
    GetTensorCImpl(co2Local, enAtomic, enSequentialWrite);
}

// Retrieves the current result block into co2Local. Scheduler-based variants
// delegate to the Scheduler; the legacy path drains the CubeOutBuffer queue and
// copies out directly (enAtomic is ignored there).
// \param co2Local destination local tensor
// \param enAtomic atomic-write mode (scheduler path only)
// \param enSequentialWrite selects the sequential-write copy template
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
template <bool sync>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::GetTensorCImpl(
    const LocalTensor<DstT>& co2Local, uint8_t enAtomic, bool enSequentialWrite)
{
    if constexpr (DoMatmulMDL(MM_CFG) || isNormEnableScheduler<A_TYPE, MM_CFG> ||
                  DoMatmulSpecialMDL(MM_CFG) || IsBasicBlockEnable<MM_CFG> || DoMatmulIBShareNorm(MM_CFG)) {
        MATMUL_MODULE(Scheduler)->GetResult(co2Local, enAtomic, enSequentialWrite);
    } else {
        (void)(enAtomic);
        // Enqueue/dequeue pairs synchronize the cube result before the copy-out.
        auto co1Local = MATMUL_MODULE(CubeOutBuffer)->GetTensor();
        MATMUL_MODULE(CubeOutBuffer)->EnQue(co1Local);
        MATMUL_MODULE(CubeOutBuffer)->DeQue();
        if (enSequentialWrite) {
            MATMUL_MODULE(CopyCubeOut)
                ->template Copy<true>(co2Local, co1Local, var.curM_, var.curN_, var.baseUseM_, var.baseUseN_,
                                        var.blockUseM_, var.blockUseN_);
        } else {
            MATMUL_MODULE(CopyCubeOut)
                ->template Copy<false>(co2Local, co1Local, var.curM_, var.curN_, var.baseUseM_, var.baseUseN_,
                                        var.blockUseM_, var.blockUseN_);
        }
        MATMUL_MODULE(CubeOutBuffer)->FreeTensor(co1Local);
    }
}

// v100, v200
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
template <bool sync>
// v100/v200: copy the accumulated matmul result out to global memory.
// SpecialMDL routes through its scheduler; all other configurations use the
// common implementation, which rejects OUTER_PRODUCT at compile time.
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::GetTensorC(
    const GlobalTensor<DstT>& gm, uint8_t enAtomic, bool enSequentialWrite)
{
    if constexpr (DoMatmulSpecialMDL(MM_CFG)) {
        MATMUL_MODULE(Scheduler)->GetResult(gm, enAtomic, enSequentialWrite);
    } else {
        static_assert(ToMatmulConfig(MM_CFG).scheduleType != ScheduleType::OUTER_PRODUCT, "Unsupported scheduleType");
        GetTensorCImpl(gm, enAtomic, enSequentialWrite);
    }
}

template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
template <bool sync>
// v100/v200 copy-out implementation for a GM destination.
// Scheduler-managed configurations delegate to Scheduler::GetResult; otherwise
// the L0C tensor is routed through the out-queue and copied to gm.
// enAtomic == 1 selects atomic-add during the copy; any non-zero value resets
// the atomic mode afterwards.
// NOTE(review): values 2/3 reach SetAtomicNone without a matching SetAtomicMax/
// SetAtomicMin on this path (unlike the v220 FixpipeL0CToGm) — confirm intended.
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::GetTensorCImpl(
    const GlobalTensor<DstT>& gm, uint8_t enAtomic, bool enSequentialWrite)
{
    if constexpr (DoMatmulMDL(MM_CFG) || isNormEnableScheduler<A_TYPE, MM_CFG> ||
        IsBasicBlockEnable<MM_CFG> || DoMatmulIBShareNorm(MM_CFG)) {
        MATMUL_MODULE(Scheduler)->GetResult(gm, enAtomic, enSequentialWrite);
    } else {
        auto co1Local = MATMUL_MODULE(CubeOutBuffer)->GetTensor();
        MATMUL_MODULE(CubeOutBuffer)->EnQue(co1Local);
        MATMUL_MODULE(CubeOutBuffer)->DeQue();
        if (enAtomic == 1) {
            SetAtomicAdd<DstT>();
        }

        // Copy<true> writes blocks back-to-back; Copy<false> honors the stride
        // of the full output matrix in GM.
        if (enSequentialWrite) {
            MATMUL_MODULE(CopyCubeOut)
                ->template Copy<true>(gm, co1Local, var.curM_, var.curN_, var.baseUseM_, var.baseUseN_, var.blockUseM_,
                                        var.blockUseN_);
        } else {
            MATMUL_MODULE(CopyCubeOut)
                ->template Copy<false>(gm, co1Local, var.curM_, var.curN_, var.baseUseM_, var.baseUseN_, var.blockUseM_,
                                        var.blockUseN_);
        }
        if (enAtomic != 0) {
            SetAtomicNone();
        }
        MATMUL_MODULE(CubeOutBuffer)->FreeTensor(co1Local);
    }
}

// v100, v200
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
template <bool sync>
// v100/v200: copy the result out to both a GM destination and a UB local tensor.
// SpecialMDL routes through its scheduler; other configurations use the common
// implementation, which rejects OUTER_PRODUCT at compile time.
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::GetTensorC(
    const GlobalTensor<DstT> &gm, const LocalTensor<DstT> &co2Local, uint8_t enAtomic, bool enSequentialWrite)
{
    if constexpr (DoMatmulSpecialMDL(MM_CFG)) {
        MATMUL_MODULE(Scheduler)->GetResult(gm, co2Local, enAtomic, enSequentialWrite);
    } else {
        static_assert(ToMatmulConfig(MM_CFG).scheduleType != ScheduleType::OUTER_PRODUCT, "Unsupported scheduleType");
        GetTensorCImpl(gm, co2Local, enAtomic, enSequentialWrite);
    }
}

template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
template <bool sync>
// v100/v200 copy-out implementation for a combined GM + UB destination.
// Mirrors the GM-only implementation above: scheduler-managed configurations
// delegate, the manual path routes L0C through the out-queue and copies via
// CopyCubeOut with both destinations. enAtomic == 1 enables atomic-add; any
// non-zero value resets the atomic mode after the copy.
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::GetTensorCImpl(
    const GlobalTensor<DstT> &gm, const LocalTensor<DstT> &co2Local, uint8_t enAtomic, bool enSequentialWrite)
{
    if constexpr (DoMatmulMDL(MM_CFG) || isNormEnableScheduler<A_TYPE, MM_CFG> ||
                  DoMatmulSpecialMDL(MM_CFG) || IsBasicBlockEnable<MM_CFG> || DoMatmulIBShareNorm(MM_CFG)) {
        MATMUL_MODULE(Scheduler)->GetResult(gm, co2Local, enAtomic, enSequentialWrite);
    } else {
        auto co1Local = MATMUL_MODULE(CubeOutBuffer)->GetTensor();
        MATMUL_MODULE(CubeOutBuffer)->EnQue(co1Local);
        MATMUL_MODULE(CubeOutBuffer)->DeQue();
        if (enAtomic == 1) {
            SetAtomicAdd<DstT>();
        }

        if (enSequentialWrite) {
            MATMUL_MODULE(CopyCubeOut)
                ->template Copy<true>(gm, co2Local, co1Local, var.curM_, var.curN_, var.baseUseM_, var.baseUseN_,
                                    var.blockUseM_, var.blockUseN_);
        } else {
            MATMUL_MODULE(CopyCubeOut)
                ->template Copy<false>(gm, co2Local, co1Local, var.curM_, var.curN_, var.baseUseM_, var.baseUseN_,
                                    var.blockUseM_, var.blockUseN_);
        }

        if (enAtomic != 0) {
            SetAtomicNone();
        }
        MATMUL_MODULE(CubeOutBuffer)->FreeTensor(co1Local);
    }
}

// v100, v200
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
// v100/v200: stage the bias row for output column block `col` and broadcast it
// into the L0C accumulator cMatrix before the mad accumulation.
// Bias may live in UB (used in place) or GM (staged through local workspace).
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::LoadBias(
    const LocalTensor<L0cT>& cMatrix, int col)
{
    LocalTensor<BiasT> bias;
    if constexpr (PhyPosIsUB(BIAS_TYPE::pos)) {
        // Bias already resides in UB: view the slice for this column block.
        bias.SetAddr(var.inputBias_.address_);
        bias = bias[col * var.tiling_.GetBaseN()];
    } else if constexpr (PhyPosIsGM(BIAS_TYPE::pos)) {
        // Bias in GM: copy the needed slice into the shared local workspace.
        GlobalTensor<BiasT> biasGlobal;
        biasGlobal.SetGlobalBuffer(var.biasGlobal_);
        bias = MATMUL_MODULE(LocalWorkspace)->GetWorkspaceWithOffset(0).template ReinterpretCast<BiasT>();
        // NOTE(review): SetSize is given GetBaseN() * sizeof(BiasT) — confirm
        // whether SetSize expects an element count or a byte count here.
        bias.SetSize(var.tiling_.GetBaseN() * sizeof(BiasT));
        if constexpr (ToMatmulConfig(MM_CFG).enableL1CacheUB) {
            // Workspace may still be in flight to L1 cache; wait for MTE3 before reuse.
            event_t eventIDMte3ToMte2 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE3_MTE2));
            SetFlag<HardEvent::MTE3_MTE2>(eventIDMte3ToMte2);
            WaitFlag<HardEvent::MTE3_MTE2>(eventIDMte3ToMte2);
        }
        DataCopy(bias, biasGlobal[col * var.tiling_.GetBaseN()], var.blockUseN_ * BLOCK_CUBE);
        // Vector unit must not read the workspace until the GM copy (MTE2) finishes.
        event_t eventIDMte2ToV = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE2_V));
        SetFlag<HardEvent::MTE2_V>(eventIDMte2ToV);
        WaitFlag<HardEvent::MTE2_V>(eventIDMte2ToV);
    } else {
        ASCENDC_ASSERT((false), { KERNEL_LOG(KERNEL_ERROR, "bias pos only can be ub or gm."); });
    }

    // Broadcast the bias vector across the M rows of each L0C fractal. When the
    // N block count exceeds the hardware repeat limit, split into chunks of
    // MAX_REPEAT_TIMES plus a tail.
    if (var.blockUseN_ <= MAX_REPEAT_TIMES) {
        for (int i = 0; i < var.blockUseM_; ++i) {
            BroadCastVecToMM(cMatrix[i * CUBE_MAX_SIZE], bias, var.blockUseN_, 1, 0, var.blockUseM_ - 1);
        }
    } else {
        int32_t loop = var.blockUseN_ / MAX_REPEAT_TIMES;
        int32_t loopTail = var.blockUseN_ % MAX_REPEAT_TIMES;
        for (int32_t i = 0; i < var.blockUseM_; ++i) {
            // NOTE(review): the destination offset mixes i * MAX_REPEAT_TIMES and
            // idx * blockUseM_ factors — verify against the v100/v200 BroadCastVecToMM layout.
            for (int32_t idx = 0; idx < loop; ++idx) {
                BroadCastVecToMM(cMatrix[i * MAX_REPEAT_TIMES * CUBE_MAX_SIZE + idx  * var.blockUseM_ * CUBE_MAX_SIZE],
                    bias[idx * BLOCK_CUBE], MAX_REPEAT_TIMES, 1, 0, var.blockUseM_ - 1);
            }
            if (loopTail) {
                BroadCastVecToMM(cMatrix[i * MAX_REPEAT_TIMES * CUBE_MAX_SIZE + loop * var.blockUseM_ * CUBE_MAX_SIZE],
                    bias[loop * BLOCK_CUBE], loopTail, 1, 0, var.blockUseM_ - 1);
            }
        }
    }


    // The L0C waits for the completion of the UB copy.
    event_t eventIDVToM = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::V_M));
    SetFlag<HardEvent::V_M>(eventIDVToM);
    WaitFlag<HardEvent::V_M>(eventIDVToM);
}
#else
// v220, only for compilation without kfc
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
template <bool sync>
// v220 (compiled only without KFC): copy the matmul result to a UB-resident
// tensor. SpecialMDL delegates to its scheduler; otherwise fall through to the
// common implementation.
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::GetTensorC(
    const LocalTensor<DstT>& co2Local, uint8_t enAtomic, bool enSequentialWrite)
{
    if constexpr (DoMatmulSpecialMDL(MM_CFG)) {
        MATMUL_MODULE(Scheduler)->GetResult(co2Local, enAtomic, enSequentialWrite);
    } else {
        GetTensorCImpl(co2Local, enAtomic, enSequentialWrite);
    }
}

template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
template <bool sync>
// v220 copy-out implementation for a UB destination.
// Scheduler-managed configurations delegate; the manual path routes the L0C
// tensor through the out-queue (EnQue/DeQue synchronizes the pipeline), copies
// to co2Local and releases the buffer. enAtomic is ignored on this path.
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::GetTensorCImpl(
    const LocalTensor<DstT>& co2Local, uint8_t enAtomic, bool enSequentialWrite)
{
    if constexpr (DoMatmulMDL(MM_CFG) || isNormEnableScheduler<A_TYPE, MM_CFG> ||
                  IsBasicBlockEnable<MM_CFG> || DoMatmulIBShareNorm(MM_CFG)) {
        MATMUL_MODULE(Scheduler)->GetResult(co2Local, enAtomic, enSequentialWrite);
    } else {
        (void)(enAtomic);
        auto co1Local = MATMUL_MODULE(CubeOutBuffer)->GetTensor();
        MATMUL_MODULE(CubeOutBuffer)->EnQue(co1Local);
        MATMUL_MODULE(CubeOutBuffer)->DeQue();
        // Copy<true> writes blocks back-to-back; Copy<false> honors the stride
        // of the full output matrix.
        if (enSequentialWrite) {
            MATMUL_MODULE(CopyCubeOut)
                ->template Copy<true>(co2Local, co1Local, var.curM_, var.curN_, var.baseUseM_, var.baseUseN_,
                                        var.blockUseM_, var.blockUseN_);
        } else {
            MATMUL_MODULE(CopyCubeOut)
                ->template Copy<false>(co2Local, co1Local, var.curM_, var.curN_, var.baseUseM_, var.baseUseN_,
                                        var.blockUseM_, var.blockUseN_);
        }
        MATMUL_MODULE(CubeOutBuffer)->FreeTensor(co1Local);
    }
}

template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::FixpipeL0CToGm(
    const GlobalTensor<DstT> &gm, const LocalTensor<L0cT> &co1Local, int curM, int curN, uint8_t enAtomic, bool enSequentialWrite)
{
    // Move one output block from the L0C accumulator to global memory.
    // enAtomic selects the atomic write mode for the copy: 1 = add, 2 = max,
    // 3 = min; any non-zero value is reset to normal writes afterwards.
    switch (enAtomic) {
        case 1:
            SetAtomicAdd<DstT>();
            break;
        case 2:
            SetAtomicMax<DstT>();
            break;
        case 3:
            SetAtomicMin<DstT>();
            break;
        default:
            break;
    }

    // Scheduler-managed configurations take the current block shape from the
    // M/N loop modules; legacy configurations read it from var.
    constexpr bool shapeFromLoops = DoMatmulMDL(MM_CFG) || isNormEnableScheduler<A_TYPE, MM_CFG> ||
        IsBasicBlockEnable<MM_CFG> || DoMatmulIBShareNorm(MM_CFG);
    // Copy<true> writes blocks back-to-back; Copy<false> honors the full output stride.
    if (enSequentialWrite) {
        if constexpr (shapeFromLoops) {
            MATMUL_MODULE(CopyCubeOut)->template Copy<true>(gm, co1Local, curM, curN,
                MATMUL_MODULE(MLoop)->GetBaseShape(), MATMUL_MODULE(NLoop)->GetBaseShape(),
                MATMUL_MODULE(MLoop)->GetBaseBlockShape(), MATMUL_MODULE(NLoop)->GetBaseBlockShape());
        } else {
            MATMUL_MODULE(CopyCubeOut)->template Copy<true>(gm, co1Local, curM, curN,
                var.baseUseM_, var.baseUseN_, var.blockUseM_, var.blockUseN_);
        }
    } else {
        if constexpr (shapeFromLoops) {
            MATMUL_MODULE(CopyCubeOut)->template Copy<false>(gm, co1Local, curM, curN,
                MATMUL_MODULE(MLoop)->GetBaseShape(), MATMUL_MODULE(NLoop)->GetBaseShape(),
                MATMUL_MODULE(MLoop)->GetBaseBlockShape(), MATMUL_MODULE(NLoop)->GetBaseBlockShape());
        } else {
            MATMUL_MODULE(CopyCubeOut)->template Copy<false>(gm, co1Local, curM, curN,
                var.baseUseM_, var.baseUseN_, var.blockUseM_, var.blockUseN_);
        }
    }

    if (enAtomic != 0) {
        SetAtomicNone();
    }
}

template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
// Thin forwarder to FixpipeL0CToGm; kept as a separate entry point
// (presumably for naming/compatibility with callers — confirm before removing).
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::FixpipeOutToGm(
    const GlobalTensor<DstT>& gm, const LocalTensor<L0cT> &co1Local, int curM, int curN, uint8_t enAtomic,
    bool enSequentialWrite)
{
    FixpipeL0CToGm(gm, co1Local, curM, curN, enAtomic, enSequentialWrite);
}

// v220
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
template <bool sync>
// v220: copy the matmul result to global memory; all dispatch happens inside
// GetTensorCImpl.
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::GetTensorC(
    const GlobalTensor<DstT>& gm, uint8_t enAtomic, bool enSequentialWrite)
{
    GetTensorCImpl(gm, enAtomic, enSequentialWrite);
}

template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
template <bool sync>
// v220 copy-out implementation for a GM destination.
// Scheduler-managed configurations delegate to Scheduler::GetResult. The
// manual path routes the L0C tensor through the out-queue and fixpipes it to
// GM; under Norm + OUTER_PRODUCT with L0 double buffering it emits two fixpipe
// passes (one per buffered N or M block), updating the tail-usage bookkeeping
// between passes.
// Cleanup vs. original: removed an inner `if constexpr (!(DoMatmulMDL(...) || ...))`
// that duplicated (negated) the enclosing else's condition and was therefore
// always true, and removed the unused local `LocalTensor<uint64_t> l1TmpForQuant`.
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::GetTensorCImpl(
    const GlobalTensor<DstT>& gm, uint8_t enAtomic, bool enSequentialWrite)
{
    if constexpr (DoMatmulSpecialMDL(MM_CFG)) {
        MATMUL_MODULE(Scheduler)->GetResult(gm, enAtomic, enSequentialWrite);
        return;
    }

    if constexpr (DoMatmulMDL(MM_CFG) || isNormEnableScheduler<A_TYPE, MM_CFG> || IsBasicBlockEnable<MM_CFG> ||
        DoMatmulIBShareNorm(MM_CFG)) {
        MATMUL_MODULE(Scheduler)->GetResult(gm, enAtomic, enSequentialWrite);
        return;
    } else {
        // remove dependency conflicts only for scene which is not db
        auto co1Local = MATMUL_MODULE(CubeOutBuffer)->GetTensor();
        MATMUL_MODULE(CubeOutBuffer)->EnQue(co1Local);
        MATMUL_MODULE(CubeOutBuffer)->DeQue();
        // MDL L0 MNDB use Scheduler->GetResult, Norm L0 MNDB wait to modify
        if constexpr (DoMatmulNorm(MM_CFG) && ToMatmulConfig(MM_CFG).scheduleType == ScheduleType::OUTER_PRODUCT) {
            if (var.sMadNStep_ > var.tiling_.GetBaseN()) { // Means L0 N db, need to excute twice FixpipeOutToGm
                FixpipeOutToGm(gm, co1Local, var.curM_, var.curN_, enAtomic, enSequentialWrite);
                // update next var.curN_ baseUseN_ before the second pass
                var.baseUseN_ = (var.curN_ + 2 == var.nIter_) ? var.tailN_ : var.tiling_.GetBaseN();
                var.blockUseN_ = Ceil(var.baseUseN_, BLOCK_CUBE);
                FixpipeOutToGm(gm, co1Local[var.tiling_.GetBaseM() * var.tiling_.GetBaseN()], var.curM_, var.curN_ + 1, enAtomic,
                    enSequentialWrite);
            } else if (var.sMadMStep_ > var.tiling_.GetBaseM()) { // Means L0 M db, need to excute twice FixpipeOutToGm
                FixpipeOutToGm(gm, co1Local, var.curM_, var.curN_, enAtomic, enSequentialWrite);
                // update next var.curM_ baseUseM_ before the second pass
                var.baseUseM_ = (var.curM_ + 2 == var.mIter_) ? var.tailM_ : var.tiling_.GetBaseM();
                var.blockUseM_ = Ceil(var.baseUseM_, BLOCK_CUBE);
                FixpipeOutToGm(gm, co1Local[var.tiling_.GetBaseM() * var.tiling_.GetBaseN()], var.curM_ + 1, var.curN_, enAtomic,
                    enSequentialWrite);
            } else {
                FixpipeOutToGm(gm, co1Local, var.curM_, var.curN_, enAtomic, enSequentialWrite);
            }
        } else {
            FixpipeL0CToGm(gm, co1Local, var.curM_, var.curN_, enAtomic, enSequentialWrite);
        }
        MATMUL_MODULE(CubeOutBuffer)->FreeTensor(co1Local);
    }
}

// v220
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
// v220: stage the bias slice for output column block `col`.
// Only active for non-batched layouts (or single-batch-larger-than-L1 mode).
// On v300+ the bias is copied from UB or GM straight into the bias queue; on
// v220 the GM overload below is used instead. cMatrix is forwarded untouched.
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::LoadBias(
    const LocalTensor<L0cT>& cMatrix, int col)
{
    if constexpr (A_TYPE::layout == LayoutMode::NONE || ToMatmulConfig(MM_CFG).batchMode == BatchMode::SINGLE_LARGE_THAN_L1) {
#if __CCE_AICORE__ >= 300
        auto bias = var.qidBias_.template AllocTensor<BiasT>();
        if constexpr (PhyPosIsUB(BIAS_TYPE::pos)) {
            // Bias lives in UB: copy the slice for this column block into the queue buffer.
            LocalTensor<BiasT> biasLocal;
            biasLocal.SetAddr(var.inputBias_.address_);
            DataCopy(bias, biasLocal[col * var.tiling_.GetBaseN()],
                { (uint16_t)1, (uint16_t)(var.blockUseN_ * BLOCK_CUBE / AscendCUtils::GetC0Count(sizeof(BiasT))),
                (uint16_t)0, (uint16_t)0 });
        } else if constexpr (PhyPosIsGM(BIAS_TYPE::pos)) {
            // Bias lives in GM: copy the slice for this column block into the queue buffer.
            GlobalTensor<BiasT> biasGlobal;
            biasGlobal.SetGlobalBuffer(var.biasGlobal_);
            DataCopy(bias, biasGlobal[col * var.tiling_.GetBaseN()],
                { (uint16_t)1, (uint16_t)(var.blockUseN_ * BLOCK_CUBE / AscendCUtils::GetC0Count(sizeof(BiasT))),
                (uint16_t)0, (uint16_t)0 });
        } else {
            ASCENDC_ASSERT((false), { KERNEL_LOG(KERNEL_ERROR, "bias pos only can be ub or gm."); });
        }
        var.qidBias_.EnQue(bias);
#else
        // v220: bias must come from GM; delegate to the GM overload.
        GlobalTensor<BiasT> biasGlobal;
        biasGlobal.SetGlobalBuffer(var.biasGlobal_);
        LoadBias(biasGlobal, cMatrix, col);
#endif
    }
}

template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
// v220: copy the bias slice for output column block `col` from GM into the
// bias queue. Unaligned widths use an ND2NZ-style DataCopy; aligned widths use
// a plain block copy whose length may be doubled under OUTER_PRODUCT +
// ORDER_M scheduling to cover both N blocks of an L0 double-buffer pair.
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::LoadBias(
    GlobalTensor<BiasT>& biasGlobal, const LocalTensor<L0cT>& cMatrix, int col)
{
    auto bias = var.qidBias_.template AllocTensor<BiasT>();
    // if var.baseUseN_ is not 32B align, use DataCopy Nd2Nz
    if ((var.baseUseN_ * sizeof(BiasT)) % ONE_BLK_SIZE != 0) {
        DataCopy(bias, biasGlobal[col * var.tiling_.GetBaseN()], { 1, 1, (uint16_t)var.baseUseN_, 0, 1, 1, 1, 0 });
    } else {
        auto blockLen = Ceil(var.baseUseN_ * sizeof(BiasT), ONE_BLK_SIZE);
        if constexpr (ToMatmulConfig(MM_CFG).scheduleType == ScheduleType::OUTER_PRODUCT && ToMatmulConfig(MM_CFG).iterateOrder == IterateOrder::ORDER_M) {
            // Double the copy length to prefetch bias for two consecutive N
            // blocks, except near the tail where only one (possibly shorter)
            // block remains.
            // NOTE(review): the factor-of-2 pairing mirrors the L0 N double
            // buffering in GetTensorCImpl — confirm if that scheme changes.
            if (var.nIter_ % 2 == 0 && var.tailN_ == var.tiling_.GetBaseN()) {
                blockLen = 2 * blockLen;
            } else {
                if (var.curN_ < var.nIter_ - 2) {
                    blockLen = 2 * blockLen;
                } else {
                    if constexpr (DoMatmulNorm(MM_CFG) && A_TYPE::layout == LayoutMode::NONE) {
                        if (var.nIter_ % 2 == 0) {
                            // Last pair with a short tail: copy base + tail in one ND2NZ transfer.
                            blockLen = var.baseUseN_ + var.tailN_;
                            DataCopy(bias, biasGlobal[col * var.tiling_.GetBaseN()], { 1, 1, (uint16_t)blockLen, 0, 1, 1, 1, 0 });
                            var.qidBias_.EnQue(bias);
                            return;
                        }
                    }
                    blockLen = 1 * blockLen;
                }
            }
        }
        DataCopy(bias, biasGlobal[col * var.tiling_.GetBaseN()],
            { (uint16_t)1, (uint16_t)blockLen, (uint16_t)0, (uint16_t)0 });
    }
    // delete after tpipe supports bias queue
    var.qidBias_.EnQue(bias);
}
#endif

template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::LoadC(bool enPartialSum)
{
    // Prepare the L0C output buffer for the next compute round. For a fresh
    // (non-partial-sum) round a new accumulator tensor is allocated; for
    // partial-sum accumulation no allocation happens — presumably the existing
    // accumulator is reused — and we only sanity-check that a prior compute
    // has run (calCount_ > 0).
    if (!enPartialSum) {
        MATMUL_MODULE(CubeOutBuffer)->AllocTensor();
        return;
    }
    ASCENDC_ASSERT((var.calCount_ > 0), {
        KERNEL_LOG(KERNEL_ERROR, "var.calCount_ is %d, which should be larger than 0.", var.calCount_);
    });
}

template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
// Load the A-matrix fractal block from L1 into L0A.
// Path selection: VECTOR format uses a flat 2D load; transposed fp32 uses the
// LoadData3D transpose engine; other transposed types use per-row 2D loads
// with on-the-fly transpose; non-transposed data uses strided 2D loads.
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::OnLoadInA2(
    const LocalTensor<SrcT>& dst, const LocalTensor<SrcT>& aMatrix)
{
    if constexpr (A_TYPE::format == CubeFormat::VECTOR) {
        // A is a vector: one repeat per fractal along K, contiguous copy.
        LoadData2dParams loadDataParams;
        loadDataParams.repeatTimes = Ceil(var.baseUseK_, BYTE_PER_FRACTAL / sizeof(SrcT));
        loadDataParams.dstGap = 0;
        loadDataParams.srcStride = 1;
        LoadData(dst, aMatrix, loadDataParams);
        return;
    }
    if (var.isTransposeA_) {
        if constexpr (sizeof(SrcT) == sizeof(float)) {
            // only support v220
            // fp32 transpose must go through the 3D load engine (img2col-style
            // parameters describe a degenerate 1x1 filter).
            uint16_t cubeKSize = Ceil(var.baseUseK_, BLOCK_CUBE) * BLOCK_CUBE;
            LoadData3DParamsV2<SrcT> loadData3dParams;
            if constexpr (PhyPosIsL1(A_TYPE::pos)) {
                // Whole A matrix resides in L1: source height is the full singleCoreK.
                loadData3dParams.l1H = var.singleCoreK_;
            } else {
                loadData3dParams.l1H = cubeKSize;
            }
            loadData3dParams.l1W = 1;
            loadData3dParams.channelSize = var.blockUseM_ * BLOCK_CUBE;
            loadData3dParams.kExtension = var.blockUseM_ * BLOCK_CUBE;
            loadData3dParams.mExtension = cubeKSize;
            loadData3dParams.kStartPt = 0;
            loadData3dParams.mStartPt = 0;
            loadData3dParams.strideW = 1;
            loadData3dParams.strideH = 1;
            loadData3dParams.filterW = 1;
            loadData3dParams.filterH = 1;
            loadData3dParams.dilationFilterW = 1;
            loadData3dParams.dilationFilterH = 1;
            loadData3dParams.enTranspose = true;
            loadData3dParams.enSmallK = false;
            loadData3dParams.padValue = 0;
            LoadData(dst, aMatrix, loadData3dParams);
        } else {
            // Non-fp32 transpose: one 2D transpose-load per M block row.
            LoadData2dParams loadDataParams;
            int dstOffset = var.blockUseK_ * CUBE_MAX_SIZE / factor_;
            int srcOffset = var.singleCoreK_ * c0Size_;
            if constexpr (!PhyPosIsL1(A_TYPE::pos)) {
                srcOffset = var.blockUseK_ * c0Size_ * BLOCK_CUBE;
            }
            loadDataParams.repeatTimes = var.blockUseK_;
            loadDataParams.srcStride = 1;
            loadDataParams.ifTranspose = true;

            if (var.blockUseK_ == 1) {
                // Single K block: all M rows are contiguous, one batched load suffices.
                loadDataParams.repeatTimes = var.blockUseM_;
                loadDataParams.srcStride = 1;
                LoadData(dst, aMatrix, loadDataParams);
            } else {
                for (int i = 0; i < var.blockUseM_; i++) {
                    LoadData(dst[i * dstOffset], aMatrix[i * srcOffset], loadDataParams);
                }
            }
        }
    } else {
        // Non-transposed A: strided 2D loads, one pass per M block row.
        LoadData2dParams loadDataParams;
        int dstOffset = var.blockUseK_ * CUBE_MAX_SIZE / factor_;
        int srcOffset = CUBE_MAX_SIZE / factor_;
#if __CCE_AICORE__ == 200
        if constexpr (IsSameType<SrcT, int8_t>::value && IsSameType<DstT, half>::value) {
            // v200 s8->f16: fractals are twice as large in this mode.
            dstOffset *= 2;
            srcOffset *= 2;
        }
#endif
        loadDataParams.repeatTimes = var.blockUseK_;
        if constexpr (PhyPosIsL1(A_TYPE::pos)) {
            // alL A matrix is in L1 buffer
            loadDataParams.srcStride = Ceil(var.singleCoreM_, BLOCK_CUBE);
        } else {
            loadDataParams.srcStride = var.blockUseM_;
        }
        loadDataParams.ifTranspose = false;

        if (var.blockUseK_ == 1) {
            // Single K block: collapse into one contiguous batched load.
            loadDataParams.repeatTimes = var.blockUseM_;
            loadDataParams.srcStride = 1;
            LoadData(dst, aMatrix, loadDataParams);
        } else {
            for (int i = 0; i < var.blockUseM_; i++) {
                LoadData(dst[i * dstOffset], aMatrix[i * srcOffset], loadDataParams);
            }
        }
    }
}

template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
// Load the B-matrix fractal block from L1 into L0B (mirror of OnLoadInA2).
// Note the inverted transpose sense vs. A: a transposed B loads without the
// transpose engine, while a non-transposed B needs transpose-loading
// (fp32 via the 3D engine, other types via 2D transpose loads).
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::OnLoadInB2(
    const LocalTensor<SrcT>& dst, const LocalTensor<SrcT>& bMatrix)
{
    if (var.isTransposeB_) {
        // Transposed B: plain 2D loads, one pass per K block row.
        LoadData2dParams loadDataParams;
        int dstOffset = var.blockUseN_ * CUBE_MAX_SIZE / factor_;
        int srcOffset = var.singleCoreN_ * c0Size_;
#if __CCE_AICORE__ == 200
        if constexpr (IsSameType<SrcT, int8_t>::value && IsSameType<DstT, half>::value) {
            // v200 s8->f16: destination fractals are twice as large in this mode.
            dstOffset *= 2;
        }
#endif
        if constexpr (!PhyPosIsL1(B_TYPE::pos)) {
            srcOffset = var.blockUseN_ * BLOCK_CUBE * c0Size_;
        }
        loadDataParams.repeatTimes = var.blockUseN_;
        loadDataParams.srcStride = 1;
        loadDataParams.ifTranspose = false;

        if (var.blockUseN_ == 1) {
            // Single N block: collapse into one contiguous batched load.
            loadDataParams.repeatTimes = var.blockUseK_;
            loadDataParams.srcStride = 1;
            LoadData(dst, bMatrix, loadDataParams);
        } else {
            for (int i = 0; i < var.blockUseK_; i++) {
                LoadData(dst[i * dstOffset], bMatrix[i * srcOffset], loadDataParams);
            }
        }
    } else {
        if constexpr (sizeof(SrcT) == sizeof(float)) {
            // only support v220
            // fp32 must use the 3D load engine for the transpose (degenerate
            // 1x1 filter parameters).
            uint16_t cubeKSize = Ceil(var.baseUseK_, BLOCK_CUBE) * BLOCK_CUBE;
            LoadData3DParamsV2<SrcT> loadData3dParams;
            if constexpr (PhyPosIsL1(B_TYPE::pos)) {
                // Whole B matrix resides in L1: source height is the full singleCoreK.
                loadData3dParams.l1H = var.singleCoreK_;
            } else {
                loadData3dParams.l1H = cubeKSize;
            }
            loadData3dParams.l1W = 1;
            loadData3dParams.channelSize = var.blockUseN_ * BLOCK_CUBE;
            loadData3dParams.kExtension = var.blockUseN_ * BLOCK_CUBE;
            loadData3dParams.mExtension = cubeKSize;
            loadData3dParams.kStartPt = 0;
            loadData3dParams.mStartPt = 0;
            loadData3dParams.strideW = 1;
            loadData3dParams.strideH = 1;
            loadData3dParams.filterW = 1;
            loadData3dParams.filterH = 1;
            loadData3dParams.dilationFilterW = 1;
            loadData3dParams.dilationFilterH = 1;
            loadData3dParams.enTranspose = true;
            loadData3dParams.enSmallK = false;
            loadData3dParams.padValue = 0;
            LoadData(dst, bMatrix, loadData3dParams);
        } else {
            // Non-fp32: 2D transpose loads, strided per K block row.
            LoadData2dParams loadDataParams;
            int dstOffset = var.blockUseN_ * CUBE_MAX_SIZE;
            constexpr int srcOffset = CUBE_MAX_SIZE;
            loadDataParams.repeatTimes = var.blockUseN_;
            if constexpr (PhyPosIsL1(B_TYPE::pos)) {
                // alL B matrix is in L1 buffer
                loadDataParams.srcStride = Ceil(var.singleCoreK_, BLOCK_CUBE);
            } else {
                loadDataParams.srcStride = var.blockUseK_;
            }
            loadDataParams.ifTranspose = true;
            if (var.blockUseN_ == 1) {
                // Single N block: collapse into one contiguous batched load.
                loadDataParams.repeatTimes = var.blockUseK_;
                loadDataParams.srcStride = 1;
                LoadData(dst, bMatrix, loadDataParams);
            } else {
                for (int i = 0; i < var.blockUseK_; i++) {
                    LoadData(dst[i * dstOffset], bMatrix[i * srcOffset], loadDataParams);
                }
            }
        }
    }
}

template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
// Compile-time dispatch of one compute round to the configuration-specific
// implementation. Configurations not handled here hit a runtime assert.
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::Compute(bool enPartialSum)
{
    if constexpr (DoMatmulNorm(MM_CFG)) {
        ComputeNorm(enPartialSum);
    } else if constexpr (DoMatmulIBShareNorm(MM_CFG)) {
        ComputeIBShareNorm(enPartialSum);
    } else if constexpr (DoMatmulSpecialMDL(MM_CFG)) {
        ComputeSpecialMDL(enPartialSum);
    } else {
        ASCENDC_ASSERT((false), { KERNEL_LOG(KERNEL_ERROR, "Unsupported matmul version."); });
    }
}

#if __CCE_AICORE__ == 220 || __CCE_AICORE__ == 200 || __CCE_AICORE__ == 300
// v220 v200 v300
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
// Performs one NORM-mode compute pass: programs MatmulInstr's static tile/flag
// members (L1 extents, transpose flags, L0 ping-pong, L1 offsets), loads the
// A/B tiles (and bias when enabled) and issues the Compute call(s).
// When IsBasic(MM_CFG) holds, a single Compute covers the whole shape in one
// shot; otherwise the K axis is reduced in a loop of baseK-sized steps that
// accumulate into L0C.
// enPartialSum: when true, L0C is not initialized on the first K step, so the
// result accumulates onto existing L0C content.
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::ComputeNorm(bool enPartialSum)
{
    if constexpr (ToMatmulConfig(MM_CFG).enableSetBias) {
        if (var.enableBias_) {
            // Preload the bias row into the L0C tile for the current N block.
            LoadBias(MATMUL_MODULE(CubeOutBuffer)->GetTensor(), var.curN_);
        }
    }
    if constexpr (IsStaticPaddingEnable(MM_CFG)) {
        // Static padding: all L1/MAD extents come from the tiling base values,
        // aligned up to BLOCK_CUBE (M/N) or c0Size_ (K).
        MatmulInstr::sAL1M_ = Ceil(var.tiling_.GetBaseM(), BLOCK_CUBE) * BLOCK_CUBE;
        MatmulInstr::sBL1N_ = Ceil(var.tiling_.GetBaseN(), BLOCK_CUBE) * BLOCK_CUBE;
        MatmulInstr::sMadM_ = var.tiling_.GetBaseM();
        MatmulInstr::sMadN_ = var.tiling_.GetBaseN();
        MatmulInstr::sAL1K_ = Ceil(var.tiling_.GetBaseK(), c0Size_) * c0Size_;
        MatmulInstr::sBL1K_ = Ceil(var.tiling_.GetBaseK(), c0Size_) * c0Size_;
        MatmulInstr::sMadK_ = var.tiling_.GetBaseK();
        MatmulInstr::sMad0K_ = var.tiling_.GetBaseK(); // split K value
    } else {
        // Extents follow the runtime block/base sizes of the current iteration.
        MatmulInstr::sAL1M_ = var.blockUseM_ * BLOCK_CUBE;
        MatmulInstr::sBL1N_ = var.blockUseN_ * BLOCK_CUBE;
        MatmulInstr::sMadM_ = var.baseUseM_;
        MatmulInstr::sMadN_ = var.baseUseN_;
    }
#if __CCE_AICORE__ == 200
    if constexpr (A_TYPE::format == CubeFormat::SCALAR || A_TYPE::format == CubeFormat::VECTOR) {
        // VECTOR support GEMV
        MatmulInstr::isGemv_ = 1;
        if constexpr (A_TYPE::format == CubeFormat::SCALAR) {
            MatmulInstr::isScalar_ = 1;
        }
    }
#endif
    MatmulInstr::ssAmatrixTranspose_ = var.isTransposeA_;
#if __CCE_AICORE__ == 200
    if constexpr (IsSameType<typename A_TYPE::T, int8_t>::value && IsSameType<typename B_TYPE::T, int8_t>::value) {
        // Net effect: for int8 x int8 both paths mark B as transposed (the
        // !isTransposeB_ branch forces true; the other copies a true flag).
        if (!var.isTransposeB_) {
            MatmulInstr::ssBmatrixTranspose_ = true;
        } else {
            MatmulInstr::ssBmatrixTranspose_ = var.isTransposeB_;
        }
    } else {
        MatmulInstr::ssBmatrixTranspose_ = var.isTransposeB_;
    }
#else
    MatmulInstr::ssBmatrixTranspose_ = var.isTransposeB_;
#endif
    if constexpr (IsStaticTilingEnable(MM_CFG)) {
        // Static tiling: iteration count and ping-pong fixed at compile time.
        var.kIter_ = GetKIter(MM_CFG);
        MatmulInstr::useL0PingPong_ = GetL0PingPong(MM_CFG);
    } else {
        // Non-zero only when both L0A and L0B are double buffered (db == 2).
        MatmulInstr::useL0PingPong_ = (var.tiling_.GetDbL0A() - 1) & (var.tiling_.GetDbL0B() - 1);
    }
    MatmulInstr::sAL1MOffset_ = 0;
    MatmulInstr::sAL1KOffset_ = 0;
    MatmulInstr::sBL1NOffset_ = 0;
    MatmulInstr::sBL1KOffset_ = 0;
    LocalTensor<SrcAT> a1;
    LocalTensor<SrcBT> b1;
    LocalTensor<BiasT> bias;

    if constexpr (IsBasic(MM_CFG)) {
        // Basic block: one Compute handles the whole single-core shape.
        if constexpr (ToMatmulConfig(MM_CFG).enableSetTail) {
            // Tail extents were configured at runtime; use them for the load.
            a1 = MATMUL_MODULE(ChosenCopyCubeInA)->LoadData(0, 0, var.tailM_, var.tailK_);
            if constexpr (!ToMatmulConfig(MM_CFG).intraBlockPartSum) {
                b1 = MATMUL_MODULE(ChosenCopyCubeInB)->LoadData(0, 0, var.tailK_, var.tailN_);
            } else {
                // Intra-block partial sum: B is loaded only on the fake-msg path.
                if (intraBlockMatmul.fakeMsg) {
                    b1 = MATMUL_MODULE(ChosenCopyCubeInB)->LoadData(0, 0, var.tailK_, var.tailN_);
                }
            }
            var.baseUseK_ = var.tailK_;
            var.blockUseK_ = Ceil(var.baseUseK_, c0Size_);
        } else {
            var.baseUseK_ = var.tiling_.GetBaseK();
            var.blockUseK_ = Ceil(var.baseUseK_, c0Size_);
            a1 = MATMUL_MODULE(ChosenCopyCubeInA)->LoadData(0, 0, var.tiling_.GetBaseM(), var.tiling_.GetBaseK());
            if constexpr(!ToMatmulConfig(MM_CFG).intraBlockPartSum) {
                b1 = MATMUL_MODULE(ChosenCopyCubeInB)->LoadData(0, 0, var.tiling_.GetBaseK(), var.tiling_.GetBaseN());
            } else if constexpr (ToMatmulConfig(MM_CFG).intraBlockPartSum) {
                if (intraBlockMatmul.fakeMsg) {
                    b1 = MATMUL_MODULE(ChosenCopyCubeInB)->LoadData(0, 0, var.tiling_.GetBaseK(), var.tiling_.GetBaseN());
                }
            }
        }
        if constexpr (!IsStaticPaddingEnable(MM_CFG)) {
            // set addr
            MatmulInstr::sAL1K_ = var.blockUseK_ * c0Size_;
            MatmulInstr::sBL1K_ = var.blockUseK_ * c0Size_;
            MatmulInstr::sMadK_ = var.baseUseK_;
            MatmulInstr::sMad0K_ = var.baseUseK_; // split K value
        }
        // A already resident in L1 (user-managed, or batched within L1):
        // use full single-core extents; K alignment depends on transpose.
        if constexpr (PhyPosIsL1(A_TYPE::pos) || (A_TYPE::layout != LayoutMode::NONE &&
            ToMatmulConfig(MM_CFG).batchMode != BatchMode::SINGLE_LARGE_THAN_L1)) {
            MatmulInstr::sAL1MOffset_ = 0;
            MatmulInstr::sAL1KOffset_ = 0;
            MatmulInstr::sAL1M_ = Ceil(var.singleCoreM_, BLOCK_CUBE) * BLOCK_CUBE;
            if (var.isTransposeA_) {
                MatmulInstr::sAL1K_ = Ceil(var.singleCoreK_, BLOCK_CUBE) * BLOCK_CUBE;
            } else {
                MatmulInstr::sAL1K_ = Ceil(var.singleCoreK_, c0Size_) * c0Size_;
            }
        }
        // Same handling for a B matrix that is already resident in L1.
        if constexpr (PhyPosIsL1(B_TYPE::pos) || (B_TYPE::layout != LayoutMode::NONE &&
            ToMatmulConfig(MM_CFG).batchMode != BatchMode::SINGLE_LARGE_THAN_L1)) {
            MatmulInstr::sBL1NOffset_ = 0;
            MatmulInstr::sBL1KOffset_ = 0;
            MatmulInstr::sBL1N_ = Ceil(var.singleCoreN_, BLOCK_CUBE) * BLOCK_CUBE;
            if (var.isTransposeB_) {
                MatmulInstr::sBL1K_ = Ceil(var.singleCoreK_, c0Size_) * c0Size_;
            } else {
                MatmulInstr::sBL1K_ = Ceil(var.singleCoreK_, BLOCK_CUBE) * BLOCK_CUBE;
            }
        }
        // set flag
        // This flag needs to be set to 0 only when the outer axis is cut to K.
        // Currently, all K processed at a time.
        MatmulInstr::sL0cInit_ = enPartialSum ? 0 : 1;
#if __CCE_AICORE__ >= 220
        if constexpr (EnUnitFlag(MM_CFG)) {
            if constexpr (!ToMatmulConfig(MM_CFG).intraBlockPartSum) {
                MatmulInstr::sL0cLast_ = 1;
            } else {
                if (intraBlockMatmul.fakeMsg) {
                    MatmulInstr::sL0cLast_ = 1;
                }
            }
        }
        if constexpr (ToMatmulConfig(MM_CFG).enableSetBias) {
            if (var.enableBias_) {
                if constexpr (A_TYPE::layout == LayoutMode::NONE || ToMatmulConfig(MM_CFG).batchMode == BatchMode::SINGLE_LARGE_THAN_L1) {
                    // In multiple batch, the L1 cache is used to offset the memory inputBias_.
                    bias = var.qidBias_.template DeQue<BiasT>();
                } else {
                    // Point directly at the user bias buffer, offset to this N block.
                    bias.SetAddr(var.inputBias_.address_);
                    bias = bias[var.curN_ * var.tiling_.GetBaseN()];
                }
                MatmulInstr::biasType_ = IsSameType<L0cT, typename BIAS_TYPE::T>::value ? 2 : 1; // 2:f32, 1:f16
                MatmulInstr::sL1BiasOffset_ = 0;
                MatmulInstr::Compute(a1, b1, MATMUL_MODULE(CubeOutBuffer)->GetTensor(), bias);
                if constexpr (A_TYPE::layout == LayoutMode::NONE || ToMatmulConfig(MM_CFG).batchMode == BatchMode::SINGLE_LARGE_THAN_L1) {
                    var.qidBias_.FreeTensor(bias);
                }
            } else {
                MatmulInstr::biasType_ = 0; // bias configured but disabled this pass
                if constexpr(ToMatmulConfig(MM_CFG).intraBlockPartSum) {
                    if (intraBlockMatmul.fakeMsg) {
                        MatmulInstr::Compute(a1, b1, MATMUL_MODULE(CubeOutBuffer)->GetTensor(), bias);
                    } else {
                        MatmulInstr::template Compute<false, false, true>(a1, b1, MATMUL_MODULE(CubeOutBuffer)->GetTensor(), bias, 0, 0);
                    }
                } else {
                    MatmulInstr::Compute(a1, b1, MATMUL_MODULE(CubeOutBuffer)->GetTensor(), bias);
                }
            }
        } else {
            MatmulInstr::biasType_ = 0;
            MatmulInstr::template Compute<!ToMatmulConfig(MM_CFG).enableSetBias, true>(a1, b1,
             MATMUL_MODULE(CubeOutBuffer)->GetTensor(), bias);
        }
#elif __CCE_AICORE__ == 200
        // On 200, biasType_ doubles as the L0C-init control when bias is off.
        if (var.enableBias_) {
            MatmulInstr::biasType_ = 0; // enable bias
            MatmulInstr::Compute(a1, b1, MATMUL_MODULE(CubeOutBuffer)->GetTensor());
        } else {
            MatmulInstr::biasType_ = MatmulInstr::sL0cInit_;
            MatmulInstr::Compute(a1, b1, MATMUL_MODULE(CubeOutBuffer)->GetTensor());
        }
#endif
        MATMUL_MODULE(ChosenCopyCubeInA)->ClearLoadData(a1);
        if constexpr(!ToMatmulConfig(MM_CFG).intraBlockPartSum) {
            MATMUL_MODULE(ChosenCopyCubeInB)->ClearLoadData(b1);
        }
    } else { // not basic
        for (int k = 0; k < var.kIter_; k++) { // start reduce K axis
            // The final K step may be a (smaller) tail block.
            if constexpr (NoTailK(MM_CFG)) {
                if constexpr (ToMatmulConfig(MM_CFG).enableSetTail) {
                    var.baseUseK_ = (k + 1 == var.kIter_) ? var.tailK_ : var.tiling_.GetBaseK();
                } else {
                    var.baseUseK_ = var.tiling_.GetBaseK();
                }
            } else {
                var.baseUseK_ = (k + 1 == var.kIter_) ? var.tailK_ : var.tiling_.GetBaseK();
            }
            var.blockUseK_ = Ceil(var.baseUseK_, c0Size_);
            a1 = MATMUL_MODULE(ChosenCopyCubeInA)->LoadData(var.curM_, k, var.baseUseM_, var.baseUseK_);
            if constexpr(!ToMatmulConfig(MM_CFG).intraBlockPartSum) {
                b1 = MATMUL_MODULE(ChosenCopyCubeInB)->LoadData(k, var.curN_, var.baseUseK_, var.baseUseN_);
            } else if constexpr (ToMatmulConfig(MM_CFG).intraBlockPartSum) {
                // Intra-block partial sum: B is loaded only on the fake-msg path.
                if (intraBlockMatmul.fakeMsg) {
                    b1 = MATMUL_MODULE(ChosenCopyCubeInB)->LoadData(k, var.curN_, var.baseUseK_, var.baseUseN_);
                }
            }
            if constexpr (!IsStaticPaddingEnable(MM_CFG)) {
                // set addr
                MatmulInstr::sAL1K_ = var.blockUseK_ * c0Size_;
                MatmulInstr::sBL1K_ = var.blockUseK_ * c0Size_;
                MatmulInstr::sMadK_ = var.baseUseK_;
                MatmulInstr::sMad0K_ = var.baseUseK_; // split K value
            }
            // A resident in L1: index into the full single-core L1 layout.
            if constexpr (PhyPosIsL1(A_TYPE::pos) || (A_TYPE::layout != LayoutMode::NONE &&
                ToMatmulConfig(MM_CFG).batchMode != BatchMode::SINGLE_LARGE_THAN_L1)) {
                MatmulInstr::sAL1MOffset_ = var.curM_ * var.tiling_.GetBaseM();
                MatmulInstr::sAL1KOffset_ = k * var.tiling_.GetBaseK();
                MatmulInstr::sAL1M_ = Ceil(var.singleCoreM_, BLOCK_CUBE) * BLOCK_CUBE;
                if (var.isTransposeA_) {
                    MatmulInstr::sAL1K_ = Ceil(var.singleCoreK_, BLOCK_CUBE) * BLOCK_CUBE;
                } else {
                    MatmulInstr::sAL1K_ = Ceil(var.singleCoreK_, c0Size_) * c0Size_;
                }
            }
            // B resident in L1: same indexing scheme on the N/K axes.
            if constexpr (PhyPosIsL1(B_TYPE::pos) || (B_TYPE::layout != LayoutMode::NONE &&
                ToMatmulConfig(MM_CFG).batchMode != BatchMode::SINGLE_LARGE_THAN_L1)) {
                MatmulInstr::sBL1NOffset_ = var.curN_ * var.tiling_.GetBaseN();
                MatmulInstr::sBL1KOffset_ = k * var.tiling_.GetBaseK();
                MatmulInstr::sBL1N_ = Ceil(var.singleCoreN_, BLOCK_CUBE) * BLOCK_CUBE;
                if (var.isTransposeB_) {
                    MatmulInstr::sBL1K_ = Ceil(var.singleCoreK_, c0Size_) * c0Size_;
                } else {
                    MatmulInstr::sBL1K_ = Ceil(var.singleCoreK_, BLOCK_CUBE) * BLOCK_CUBE;
                }
            }
            // set flag
            // This flag needs to be set to 0 only when the outer axis is cut to K.
            // Currently, all K processed at a time.
            if (k == 0) {
                // First K step decides whether L0C starts fresh or accumulates.
                MatmulInstr::sL0cInit_ = enPartialSum ? 0 : 1;
            } else {
                MatmulInstr::sL0cInit_ = 0;
            }
#if __CCE_AICORE__ >= 220
            if constexpr (EnUnitFlag(MM_CFG)) {
                // Unit flag marks the last K step of the accumulation.
                if constexpr (ToMatmulConfig(MM_CFG).intraBlockPartSum) {
                    if (intraBlockMatmul.fakeMsg) {
                        if (k == var.kIter_ - 1) {
                            MatmulInstr::sL0cLast_ = 1;
                        } else {
                            MatmulInstr::sL0cLast_ = 0;
                        }
                    }
                } else {
                    if (k == var.kIter_ - 1) {
                        MatmulInstr::sL0cLast_ = 1;
                    } else {
                        MatmulInstr::sL0cLast_ = 0;
                    }
                }
            }
            if constexpr (ToMatmulConfig(MM_CFG).enableSetBias) {
                // Bias is injected only on the first K step.
                if (k == 0 && var.enableBias_) {
                    if constexpr (A_TYPE::layout == LayoutMode::NONE || ToMatmulConfig(MM_CFG).batchMode == BatchMode::SINGLE_LARGE_THAN_L1) {
                        // In multiple batch, the L1 cache is used to offset the memory inputBias_.
                        bias = var.qidBias_.template DeQue<BiasT>();
                    } else {
                        bias.SetAddr(var.inputBias_.address_);
                        bias = bias[var.curN_ * var.tiling_.GetBaseN()];
                    }
                    MatmulInstr::biasType_ = IsSameType<L0cT, typename BIAS_TYPE::T>::value ? 2 : 1; // 2:f32, 1:f16
                    MatmulInstr::sL1BiasOffset_ = 0;
                    if constexpr (IsL0Cache<A_TYPE, MM_CFG>()) {
                        // L0 cache keys: linearized (block, k) positions of A and B.
                        uint32_t singlePosA = var.curM_ * var.kIter_;
                        uint32_t singlePosB = var.curN_ * var.kIter_;
                        MatmulInstr::Compute(a1, b1, MATMUL_MODULE(CubeOutBuffer)->GetTensor(), bias, 0, 0, 0, 0, singlePosA, singlePosB);
                    } else {
                        MatmulInstr::Compute(a1, b1, MATMUL_MODULE(CubeOutBuffer)->GetTensor(), bias);
                    }
                    if constexpr (A_TYPE::layout == LayoutMode::NONE || ToMatmulConfig(MM_CFG).batchMode == BatchMode::SINGLE_LARGE_THAN_L1) {
                        var.qidBias_.FreeTensor(bias);
                    }
                } else {
                    MatmulInstr::biasType_ = 0;
                    if constexpr(ToMatmulConfig(MM_CFG).intraBlockPartSum) {
                        if (intraBlockMatmul.fakeMsg) {
                            MatmulInstr::Compute(a1, b1, MATMUL_MODULE(CubeOutBuffer)->GetTensor(), bias);
                        } else {
                            // B offset selects the cached base block within the stepN window.
                            int posB = (var.curN_ * var.kIter_ + k) % (var.tiling_.GetStepN() * var.kIter_);
                            MatmulInstr::template Compute<false, false, true>(a1, b1, MATMUL_MODULE(CubeOutBuffer)->GetTensor(), bias,
                                posB * var.tiling_.GetBaseK() * var.tiling_.GetBaseN(), 0);
                        }
                    } else {
                        if constexpr (IsL0Cache<A_TYPE, MM_CFG>()) {
                            uint32_t singlePosA = var.curM_ * var.kIter_ + k;
                            uint32_t singlePosB = var.curN_ * var.kIter_ + k;
                            MatmulInstr::Compute(a1, b1, MATMUL_MODULE(CubeOutBuffer)->GetTensor(), bias, 0, 0, 0, 0, singlePosA, singlePosB);
                        } else {
                            MatmulInstr::Compute(a1, b1, MATMUL_MODULE(CubeOutBuffer)->GetTensor(), bias);
                        }
                    }
                }
            } else {
                MatmulInstr::biasType_ = 0;
                if constexpr (IsL0Cache<A_TYPE, MM_CFG>()) {
                    uint32_t singlePosA = var.curM_ * var.kIter_ + k;
                    uint32_t singlePosB = var.curN_ * var.kIter_ + k;
                    MatmulInstr::template Compute<!ToMatmulConfig(MM_CFG).enableSetBias, true>(a1, b1,
                        MATMUL_MODULE(CubeOutBuffer)->GetTensor(), bias, 0, 0, 0, 0, singlePosA, singlePosB);
                } else {
                    MatmulInstr::template Compute<!ToMatmulConfig(MM_CFG).enableSetBias, true>(a1, b1,
                        MATMUL_MODULE(CubeOutBuffer)->GetTensor(), bias);
                }
            }
#elif __CCE_AICORE__ == 200
            // On 200, biasType_ doubles as the L0C-init control when bias is off.
            if (var.enableBias_) {
                MatmulInstr::biasType_ = 0; // enable bias
                MatmulInstr::Compute(a1, b1, MATMUL_MODULE(CubeOutBuffer)->GetTensor());
            } else {
                MatmulInstr::biasType_ = MatmulInstr::sL0cInit_;
                MatmulInstr::Compute(a1, b1, MATMUL_MODULE(CubeOutBuffer)->GetTensor());
            }
#endif
            MATMUL_MODULE(ChosenCopyCubeInA)->ClearLoadData(a1, var.curM_, k);
            if constexpr(!ToMatmulConfig(MM_CFG).intraBlockPartSum) {
                MATMUL_MODULE(ChosenCopyCubeInB)->ClearLoadData(b1, k, var.curN_);
            }
        }
    }
}

// v220
// Batch-layout compute with L0 double buffering: programs the full
// single-core L1 extents and the offsets of the current (curM_, curN_) block,
// loads one A tile and one B tile, and issues a single Compute templated on
// the configured scheduleType / iterateOrder.
// kInner is accepted for interface parity with the other per-k helpers but is
// unused here.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::ComputeBatchNormL0DB(int kInner)
{
    (void)kInner;
    // L0 ping-pong is active only when both L0A and L0B are double buffered.
    MatmulInstr::useL0PingPong_ = (var.tiling_.GetDbL0A() - 1) & (var.tiling_.GetDbL0B() - 1);
    MatmulInstr::sAL1MOffset_ = var.curM_ * var.tiling_.GetBaseM();
    MatmulInstr::sAL1KOffset_ = 0;
    MatmulInstr::sAL1M_ = Ceil(var.singleCoreM_, BLOCK_CUBE) * BLOCK_CUBE;
    // K-extent alignment granularity depends on the A transpose flag.
    if (var.isTransposeA_) {
        MatmulInstr::sAL1K_ = Ceil(var.singleCoreK_, BLOCK_CUBE) * BLOCK_CUBE;
    } else {
        MatmulInstr::sAL1K_ = Ceil(var.singleCoreK_, c0Size_) * c0Size_;
    }
    MatmulInstr::sBL1NOffset_ = var.curN_ * var.tiling_.GetBaseN();
    MatmulInstr::sBL1KOffset_ = 0;
    MatmulInstr::sBL1N_ = Ceil(var.singleCoreN_, BLOCK_CUBE) * BLOCK_CUBE;
    if (var.isTransposeB_) {
        MatmulInstr::sBL1K_ = Ceil(var.singleCoreK_, c0Size_) * c0Size_;
    } else {
        MatmulInstr::sBL1K_ = Ceil(var.singleCoreK_, BLOCK_CUBE) * BLOCK_CUBE;
    }
    auto a1 = MATMUL_MODULE(ChosenCopyCubeInA)->LoadData(var.curM_, 0, var.baseUseM_, var.baseUseK_);
    auto b1 = MATMUL_MODULE(ChosenCopyCubeInB)->LoadData(0, var.curN_, var.baseUseK_, var.baseUseN_);
    LocalTensor<BiasT> bias;
    if constexpr (ToMatmulConfig(MM_CFG).enableSetBias) {
        if (var.enableBias_) {
            // Point directly at the user bias buffer, offset to this N block.
            bias.SetAddr(var.inputBias_.address_);
            bias = bias[var.curN_ * var.tiling_.GetBaseN()];
            MatmulInstr::biasType_ = IsSameType<L0cT, BiasT>::value ? 2 : 1; // 2:f32, 1:f16
            MatmulInstr::sL1BiasOffset_ = 0;
            MatmulInstr::template Compute<false, false, false, ToMatmulConfig(MM_CFG).scheduleType, ToMatmulConfig(MM_CFG).iterateOrder>(a1, b1,
             MATMUL_MODULE(CubeOutBuffer)->GetTensor(), bias, 0, 0, var.sMadMStep_, var.sMadNStep_);

            // NOTE(review): bias was bound via SetAddr(inputBias_) above, yet it
            // is released on qidBias_ for SINGLE_LARGE_THAN_L1 -- confirm this
            // acquire/free pairing is intended.
            if constexpr (ToMatmulConfig(MM_CFG).batchMode == BatchMode::SINGLE_LARGE_THAN_L1) {
                var.qidBias_.FreeTensor(bias);
            }
        } else {
            MatmulInstr::biasType_ = 0;
            MatmulInstr::template Compute<false, false, false, ToMatmulConfig(MM_CFG).scheduleType, ToMatmulConfig(MM_CFG).iterateOrder>(a1, b1,
             MATMUL_MODULE(CubeOutBuffer)->GetTensor(), bias, 0, 0, var.sMadMStep_, var.sMadNStep_);
        }
    } else {
        MatmulInstr::biasType_ = 0;
        MatmulInstr::template Compute<!ToMatmulConfig(MM_CFG).enableSetBias, true, false, ToMatmulConfig(MM_CFG).scheduleType, ToMatmulConfig(MM_CFG).iterateOrder>(a1, b1,
         MATMUL_MODULE(CubeOutBuffer)->GetTensor(), bias, 0, 0, var.sMadMStep_, var.sMadNStep_);
    }
}

// Per-K-step compute with double buffering along the M axis: one B tile is
// loaded for this (kInner, curN_) position and reused across up to two
// consecutive M blocks, each with a freshly loaded A tile. When the second
// M block is the M tail, sAL1M_/sMadM_ are shrunk accordingly.
// Bias (when enabled) is dequeued and applied only on the first K step.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::ComputeNormWithMdb(int kInner)
{
    // Process two M blocks per call except when curM_ is the last block.
    int dbLoop = (var.curM_ + 1 == var.mIter_) ? 1 : Impl::DOUBLE_SIZE;
    MatmulInstr::useL0PingPong_ = (dbLoop == Impl::DOUBLE_SIZE) ? 1 : 0;
    LocalTensor<BiasT> bias;
    bool isBiasEnable = false;
    if constexpr (ToMatmulConfig(MM_CFG).enableSetBias) {
        if (kInner == 0 && var.enableBias_) {
            isBiasEnable = true;
            bias = var.qidBias_.template DeQue<BiasT>();
            MatmulInstr::biasType_ = IsSameType<L0cT, BiasT>::value ? 2 : 1; // 2:f32, 1:f16
            MatmulInstr::sL1BiasOffset_ = 0;
        }
    }
    int dbUsedM = var.baseUseM_;
    auto b1 = MATMUL_MODULE(ChosenCopyCubeInB)->LoadData(kInner, var.curN_, var.baseUseK_, var.baseUseN_);
    for (int dbInner = 0; dbInner < dbLoop; dbInner++) {
        if (dbInner > 0) {
            if (var.curM_ + Impl::DOUBLE_SIZE == var.mIter_) {
                // if tailM_ != baseM, reset sAL1M_ and sMadM_
                dbUsedM = var.tailM_;
                MatmulInstr::sAL1M_ = CeilAlign(dbUsedM, BLOCK_CUBE);
                MatmulInstr::sMadM_ = dbUsedM;
            }
        }
        auto a1 = MATMUL_MODULE(ChosenCopyCubeInA)->LoadData(var.curM_ + dbInner, kInner, dbUsedM, var.baseUseK_);
        if constexpr (ToMatmulConfig(MM_CFG).enableSetBias) {
            if (!isBiasEnable) {
                MatmulInstr::biasType_ = 0;
            }
            if constexpr (IsL0Cache<A_TYPE, MM_CFG>()) {
                // L0 cache keys: linearized (block, k) positions. When bias is
                // injected on this step the positions are not advanced by kInner.
                uint32_t singlePosA = (var.curM_ + dbInner) * var.kIter_;
                uint32_t singlePosB = var.curN_ * var.kIter_;
                if (!isBiasEnable) {
                    singlePosA += kInner;
                    singlePosB += kInner;
                }
                MatmulInstr::template Compute<false, true, false, ToMatmulConfig(MM_CFG).scheduleType,
                    ToMatmulConfig(MM_CFG).iterateOrder, true>(a1, b1, MATMUL_MODULE(CubeOutBuffer)->GetTensor(),
                    bias, 0, 0, var.sMadMStep_, var.sMadNStep_, singlePosA, singlePosB, var.tiling_.GetBaseM(), var.tiling_.GetBaseN());
            } else {
                MatmulInstr::template Compute<false, true, false, ToMatmulConfig(MM_CFG).scheduleType,
                    ToMatmulConfig(MM_CFG).iterateOrder, true>(a1, b1, MATMUL_MODULE(CubeOutBuffer)->GetTensor(),
                    bias, 0, 0, var.sMadMStep_, var.sMadNStep_, 0, 0, var.tiling_.GetBaseM(), var.tiling_.GetBaseN());
            }
        } else {
            MatmulInstr::biasType_ = 0;
            if constexpr (IsL0Cache<A_TYPE, MM_CFG>()) {
                uint32_t singlePosA = (var.curM_ + dbInner) * var.kIter_ + kInner;
                uint32_t singlePosB = var.curN_ * var.kIter_ + kInner;
                MatmulInstr::template Compute<true, true, false, ToMatmulConfig(MM_CFG).scheduleType,
                    ToMatmulConfig(MM_CFG).iterateOrder, true>(a1, b1, MATMUL_MODULE(CubeOutBuffer)->GetTensor(),
                    bias, 0, 0, var.sMadMStep_, var.sMadNStep_, singlePosA, singlePosB, var.tiling_.GetBaseM(), var.tiling_.GetBaseN());
            } else {
                MatmulInstr::template Compute<true, true, false, ToMatmulConfig(MM_CFG).scheduleType,
                    ToMatmulConfig(MM_CFG).iterateOrder, true>(a1, b1, MATMUL_MODULE(CubeOutBuffer)->GetTensor(),
                    bias, 0, 0, var.sMadMStep_, var.sMadNStep_, 0, 0, var.tiling_.GetBaseM(), var.tiling_.GetBaseN());
            }
        }
        MATMUL_MODULE(ChosenCopyCubeInA)->ClearLoadData(a1, var.curM_ + dbInner, kInner);
    }
    if constexpr (ToMatmulConfig(MM_CFG).enableSetBias) {
        if (isBiasEnable) {
            var.qidBias_.FreeTensor(bias);
        }
    }
    MATMUL_MODULE(ChosenCopyCubeInB)->ClearLoadData(b1, kInner, var.curN_);
}

// Per-K-step compute with double buffering along the N axis: one A tile is
// loaded for this (curM_, kInner) position and reused across up to two
// consecutive N blocks, each with a freshly loaded B tile. When the second
// N block is the N tail, sBL1N_/sMadN_ are shrunk accordingly.
// Bias (when enabled) is dequeued and applied only on the first K step.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::ComputeNormWithNdb(int kInner)
{
    // Process two N blocks per call except when curN_ is the last block.
    int dbLoop = (var.curN_ + 1 == var.nIter_) ? 1 : Impl::DOUBLE_SIZE;
    MatmulInstr::useL0PingPong_ = (dbLoop == Impl::DOUBLE_SIZE) ? 1 : 0;
    LocalTensor<BiasT> bias;
    bool isBiasEnable = false;
    if constexpr (ToMatmulConfig(MM_CFG).enableSetBias) {
        if (kInner == 0 && var.enableBias_) {
            isBiasEnable = true;
            bias = var.qidBias_.template DeQue<BiasT>();
            MatmulInstr::biasType_ = IsSameType<L0cT, BiasT>::value ? 2 : 1; // 2:f32, 1:f16
            MatmulInstr::sL1BiasOffset_ = 0;
        }
    }
    int dbUsedN = var.baseUseN_;
    auto a1 = MATMUL_MODULE(ChosenCopyCubeInA)->LoadData(var.curM_, kInner, var.baseUseM_, var.baseUseK_);
    for (int dbInner = 0; dbInner < dbLoop; dbInner++) {
        if (dbInner > 0) {
            if (var.curN_ + Impl::DOUBLE_SIZE == var.nIter_) {
                // if tailN_ != baseN, reset sBL1N_ and sMadN_
                dbUsedN = var.tailN_;
                MatmulInstr::sBL1N_ = CeilAlign(dbUsedN, BLOCK_CUBE);
                MatmulInstr::sMadN_ = dbUsedN;
            }
        }
        auto b1 = MATMUL_MODULE(ChosenCopyCubeInB)->LoadData(kInner, var.curN_ + dbInner, var.baseUseK_, dbUsedN);
        if constexpr (ToMatmulConfig(MM_CFG).enableSetBias) {
            if (!isBiasEnable) {
                MatmulInstr::biasType_ = 0;
            }
            if constexpr (IsL0Cache<A_TYPE, MM_CFG>()) {
                // L0 cache keys: linearized (block, k) positions. When bias is
                // injected on this step the positions are not advanced by kInner.
                uint32_t singlePosA = var.curM_ * var.kIter_;
                uint32_t singlePosB = (var.curN_ + dbInner) * var.kIter_;
                if (!isBiasEnable) {
                    singlePosA += kInner;
                    singlePosB += kInner;
                }
                MatmulInstr::template Compute<false, true, false, ToMatmulConfig(MM_CFG).scheduleType,
                    ToMatmulConfig(MM_CFG).iterateOrder, true>(a1, b1, MATMUL_MODULE(CubeOutBuffer)->GetTensor(),
                    bias, 0, 0, var.sMadMStep_, var.sMadNStep_, singlePosA, singlePosB, var.tiling_.GetBaseM(), var.tiling_.GetBaseN());
            } else {
                MatmulInstr::template Compute<false, true, false, ToMatmulConfig(MM_CFG).scheduleType,
                    ToMatmulConfig(MM_CFG).iterateOrder, true>(a1, b1, MATMUL_MODULE(CubeOutBuffer)->GetTensor(),
                    bias, 0, 0, var.sMadMStep_, var.sMadNStep_, 0, 0, var.tiling_.GetBaseM(), var.tiling_.GetBaseN());
            }
        } else {
            MatmulInstr::biasType_ = 0;
            if constexpr (IsL0Cache<A_TYPE, MM_CFG>()) {
                uint32_t singlePosA = var.curM_ * var.kIter_ + kInner;
                uint32_t singlePosB = (var.curN_ + dbInner) * var.kIter_ + kInner;
                MatmulInstr::template Compute<true, true, false, ToMatmulConfig(MM_CFG).scheduleType,
                    ToMatmulConfig(MM_CFG).iterateOrder, true>(a1, b1, MATMUL_MODULE(CubeOutBuffer)->GetTensor(),
                    bias, 0, 0, var.sMadMStep_, var.sMadNStep_, singlePosA, singlePosB, var.tiling_.GetBaseM(), var.tiling_.GetBaseN());
            } else {
                MatmulInstr::template Compute<true, true, false, ToMatmulConfig(MM_CFG).scheduleType,
                    ToMatmulConfig(MM_CFG).iterateOrder, true>(a1, b1, MATMUL_MODULE(CubeOutBuffer)->GetTensor(),
                    bias, 0, 0, var.sMadMStep_, var.sMadNStep_, 0, 0, var.tiling_.GetBaseM(), var.tiling_.GetBaseN());
            }
        }
        MATMUL_MODULE(ChosenCopyCubeInB)->ClearLoadData(b1, kInner, var.curN_ + dbInner);
    }
    if constexpr (ToMatmulConfig(MM_CFG).enableSetBias) {
        if (isBiasEnable) {
            var.qidBias_.FreeTensor(bias);
        }
    }
    MATMUL_MODULE(ChosenCopyCubeInA)->ClearLoadData(a1, var.curM_, kInner);
}

// Reduces the K axis with L0 double buffering. For each K step it resets the
// per-block L1/MAD extents, sets the L0C init/last flags, then dispatches:
// batch layouts go to ComputeBatchNormL0DB; otherwise ORDER_M uses the
// N-direction double-buffer path and ORDER_N the M-direction path (resolved
// at compile time when fixed in MM_CFG, else read from the runtime tiling).
// enPartialSum: when true, L0C is not initialized on the first K step.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::ComputeNormL0DB(bool enPartialSum)
{
#if __CCE_AICORE__ == 220
    if constexpr (A_TYPE::layout != LayoutMode::NONE) {
        ASCENDC_ASSERT((var.singleCoreK_ <= var.tiling_.GetBaseK()) && (ToMatmulConfig(MM_CFG).batchMode != BatchMode::SINGLE_LARGE_THAN_L1), {
            KERNEL_LOG(KERNEL_ERROR,
                "ComputeNormL0DB only support singleCoreK_ <= baseK, and BatchMode is not SINGLE_LARGE_THAN_L1.");
        });
    }
#endif
    if constexpr (ToMatmulConfig(MM_CFG).enableSetBias) {
        if (var.enableBias_) {
            // Preload the bias row into the L0C tile for the current N block.
            LoadBias(MATMUL_MODULE(CubeOutBuffer)->GetTensor(), var.curN_);
        }
    }

    MatmulInstr::ssAmatrixTranspose_ = var.isTransposeA_;
    MatmulInstr::ssBmatrixTranspose_ = var.isTransposeB_;
    MatmulInstr::sAL1MOffset_ = 0;
    MatmulInstr::sBL1NOffset_ = 0;
    MatmulInstr::sAL1KOffset_ = 0;
    MatmulInstr::sBL1KOffset_ = 0;
    for (int k = 0; k < var.kIter_; k++) {
        // The final K step may be a (smaller) tail block.
        var.baseUseK_ = (k + 1 == var.kIter_) ? var.tailK_ : var.tiling_.GetBaseK();
        var.blockUseK_ = Ceil(var.baseUseK_, c0Size_);
        // set addr
        // reset sAL1M_/sBL1N_/sMadM_/sMadN_ at each k
        MatmulInstr::sAL1M_ = var.blockUseM_ * BLOCK_CUBE;
        MatmulInstr::sBL1N_ = var.blockUseN_ * BLOCK_CUBE;
        MatmulInstr::sMadM_ = var.baseUseM_;
        MatmulInstr::sMadN_ = var.baseUseN_;
        MatmulInstr::sAL1K_ = var.blockUseK_ * c0Size_;
        MatmulInstr::sBL1K_ = var.blockUseK_ * c0Size_;
        MatmulInstr::sMadK_ = var.baseUseK_;
        MatmulInstr::sMad0K_ = var.baseUseK_; // split K value
        // set flag
        // This flag needs to be set to 0 only when the outer axis is cut to K.
        // Currently, all K processed at a time.
        if (k == 0) {
            // First K step decides whether L0C starts fresh or accumulates.
            MatmulInstr::sL0cInit_ = enPartialSum ? 0 : 1;
        } else {
            MatmulInstr::sL0cInit_ = 0;
        }

        if constexpr (EnUnitFlag(MM_CFG)) {
            // Unit flag marks the last K step of the accumulation.
            if (k == var.kIter_ - 1) {
                MatmulInstr::sL0cLast_ = 1;
            } else {
                MatmulInstr::sL0cLast_ = 0;
            }
        }
        if constexpr (A_TYPE::layout != LayoutMode::NONE) {
            ComputeBatchNormL0DB(k);
        } else if constexpr (ToMatmulConfig(MM_CFG).iterateOrder == IterateOrder::ORDER_M) {
            // M iterated outermost -> double buffer along N.
            ComputeNormWithNdb(k);
        } else if constexpr (ToMatmulConfig(MM_CFG).iterateOrder == IterateOrder::ORDER_N) {
            ComputeNormWithMdb(k);
        } else {
            // Iterate order not fixed at compile time: read it from tiling.
            if (var.tiling_.GetIterateOrder() == static_cast<int>(IterateOrder::ORDER_M)) {
                ComputeNormWithNdb(k);
            } else {
                ComputeNormWithMdb(k);
            }
        }
    }
}

// v220 v200 v300
// IB-share NORM compute: reduces the K axis using the CopyCubeInA/B modules
// directly (not the Chosen* aliases used by the other paths). Each K step
// re-programs the per-block extents (and, for L1-resident inputs, the offsets
// into the full single-core L1 layout), sets the L0C init/last flags, and
// issues Compute. Bias, when enabled, is dequeued and consumed on the first
// K step only.
// enPartialSum: when true, L0C is not initialized on the first K step.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::ComputeIBShareNorm(
    bool enPartialSum)
{
    if (var.enableBias_) {
        // Preload the bias row into the L0C tile for the current N block.
        LoadBias(MATMUL_MODULE(CubeOutBuffer)->GetTensor(), var.curN_);
    }

    MatmulInstr::sAL1M_ = var.blockUseM_ * BLOCK_CUBE;
    MatmulInstr::sBL1N_ = var.blockUseN_ * BLOCK_CUBE;
    MatmulInstr::sMadM_ = var.baseUseM_;
    MatmulInstr::sMadN_ = var.baseUseN_;
    MatmulInstr::ssAmatrixTranspose_ = var.isTransposeA_;
    MatmulInstr::ssBmatrixTranspose_ = var.isTransposeB_;
    // L0 ping-pong only when both L0A and L0B are double buffered.
    MatmulInstr::useL0PingPong_ = (var.tiling_.GetDbL0A() - 1) & (var.tiling_.GetDbL0B() - 1);
    LocalTensor<BiasT> bias;
    for (int k = 0; k < var.kIter_; k++) { // start reduce K axis
        // The final K step may be a (smaller) tail block.
        var.baseUseK_ = (k + 1 == var.kIter_) ? var.tailK_ : var.tiling_.GetBaseK();
        var.blockUseK_ = Ceil(var.baseUseK_, c0Size_);
        auto a1 = MATMUL_MODULE(CopyCubeInA)->LoadData(var.curM_, k, var.baseUseM_, var.baseUseK_);
        auto b1 = MATMUL_MODULE(CopyCubeInB)->LoadData(k, var.curN_, var.baseUseK_, var.baseUseN_);
        // set addr
        MatmulInstr::sAL1K_ = var.blockUseK_ * c0Size_;
        MatmulInstr::sBL1K_ = var.blockUseK_ * c0Size_;
        MatmulInstr::sMadK_ = var.baseUseK_;
        MatmulInstr::sAL1MOffset_ = 0;
        MatmulInstr::sAL1KOffset_ = 0;
        if constexpr (PhyPosIsL1(A_TYPE::pos)) {
            // A is resident in L1: index into the full single-core layout.
            MatmulInstr::sAL1MOffset_ = var.curM_ * var.tiling_.GetBaseM();
            MatmulInstr::sAL1KOffset_ = k * var.tiling_.GetBaseK();
            MatmulInstr::sAL1M_ = Ceil(var.singleCoreM_, BLOCK_CUBE) * BLOCK_CUBE;
            if (var.isTransposeA_) {
                MatmulInstr::sAL1K_ = Ceil(var.singleCoreK_, BLOCK_CUBE) * BLOCK_CUBE;
            } else {
                MatmulInstr::sAL1K_ = Ceil(var.singleCoreK_, c0Size_) * c0Size_;
            }
        }
        MatmulInstr::sBL1NOffset_ = 0;
        MatmulInstr::sBL1KOffset_ = 0;
        if constexpr (PhyPosIsL1(B_TYPE::pos)) {
            // B is resident in L1: same indexing scheme on the N/K axes.
            MatmulInstr::sBL1NOffset_ = var.curN_ * var.tiling_.GetBaseN();
            MatmulInstr::sBL1KOffset_ = k * var.tiling_.GetBaseK();
            MatmulInstr::sBL1N_ = Ceil(var.singleCoreN_, BLOCK_CUBE) * BLOCK_CUBE;
            if (var.isTransposeB_) {
                MatmulInstr::sBL1K_ = Ceil(var.singleCoreK_, c0Size_) * c0Size_;
            } else {
                MatmulInstr::sBL1K_ = Ceil(var.singleCoreK_, BLOCK_CUBE) * BLOCK_CUBE;
            }
        }
        MatmulInstr::sMad0K_ = var.baseUseK_; // split K value
        // set flag
        // This flag needs to be set to 0 only when the outer axis is cut to K.
        // Currently, all K processed at a time.
        if (k == 0) {
            MatmulInstr::sL0cInit_ = enPartialSum ? 0 : 1;
        } else {
            MatmulInstr::sL0cInit_ = 0;
        }
        if constexpr (EnUnitFlag(MM_CFG)) {
            // Unit flag marks the last K step of the accumulation.
            if (k == var.kIter_ - 1) {
                MatmulInstr::sL0cLast_ = 1;
            } else {
                MatmulInstr::sL0cLast_ = 0;
            }
        }

        if (k == 0 && var.enableBias_) {
            bias = var.qidBias_.template DeQue<BiasT>();
            MatmulInstr::biasType_ = IsSameType<L0cT, typename BIAS_TYPE::T>::value ? 2 : 1; // 2:f32, 1:f16
            MatmulInstr::sL1BiasOffset_ = 0;
            MatmulInstr::Compute(a1, b1, MATMUL_MODULE(CubeOutBuffer)->GetTensor(), bias);
            var.qidBias_.FreeTensor(bias);

        } else {
            // NOTE(review): this passes the bias handle freed after the k == 0
            // call (or a default-constructed one); biasType_ == 0 here, so the
            // handle is presumably ignored by Compute -- confirm.
            MatmulInstr::biasType_ = 0;
            MatmulInstr::Compute(a1, b1, MATMUL_MODULE(CubeOutBuffer)->GetTensor(), bias);
        }
        MATMUL_MODULE(CopyCubeInA)->ClearLoadData(a1, var.curM_, k);
        MATMUL_MODULE(CopyCubeInB)->ClearLoadData(b1, k, var.curN_);
    }
}

#else

// v100
// Fallback stub compiled only on core versions without a norm compute path
// (the #else branch of the arch guard): any runtime call aborts with a
// kernel error.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::ComputeNorm(bool enPartialSum)
{
    ASCENDC_ASSERT((false), { KERNEL_LOG(KERNEL_ERROR, "Unsupported matmul version."); });
}

// Fallback stub compiled only on core versions without an IB-share norm
// compute path: any runtime call aborts with a kernel error.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::ComputeIBShareNorm(
    bool enPartialSum)
{
    ASCENDC_ASSERT((false), { KERNEL_LOG(KERNEL_ERROR, "Unsupported matmul version."); });
}

#endif

// Performs one iteration step, dispatching to the scheduling strategy that
// MM_CFG selects at compile time.
// @param enPartialSum accumulate into the existing L0C content instead of
//        initializing it.
// @return true while another output block was produced, false once the
//         single-core tile is exhausted.
// Fix: removed a stray double semicolon (empty statement) on the MDL branch.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
template <bool sync>
__aicore__ inline bool MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::Iterate(bool enPartialSum)
{
    if constexpr (DoMatmulNorm(MM_CFG)) {
        if constexpr (isNormEnableScheduler<A_TYPE, MM_CFG>) {
            return IterateNormScheduler(enPartialSum);
        } else {
            return IterateNorm(enPartialSum);
        }
    } else if constexpr (IsBasicBlockEnable<MM_CFG> && !Impl::Detail::MatmulFeatureTrait<MM_CFG>::IsNeedUB()) {
        return MATMUL_MODULE(Scheduler)->ScheduleOnce(enPartialSum);
    } else if constexpr (DoMatmulMDL(MM_CFG)) {
        return MATMUL_MODULE(Scheduler)->ScheduleOnce(enPartialSum);
    } else if constexpr (DoMatmulIBShareNorm(MM_CFG)) {
        return IterateIBShareNorm(enPartialSum);
    } else if constexpr (DoMatmulSpecialMDL(MM_CFG)) {
        return IterateSpecialMDL(enPartialSum);
    } else {
        ASCENDC_ASSERT((false), { KERNEL_LOG(KERNEL_ERROR, "Unsupported matmul version."); });
        return false;
    }
}

// Performs one iteration step of the norm schedule: advances the block
// cursor, resolves the effective base block sizes for this step (taking the
// tail block on the last M/N iteration unless the config rules tails out),
// then loads the C address and computes one output block.
// @return true while another block was produced, false when the tile is done.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
template <bool sync>
__aicore__ inline bool MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::IterateNorm(bool enPartialSum)
{
    // The outer-product schedule has a dedicated L0 double-buffer path.
    if constexpr (ToMatmulConfig(MM_CFG).scheduleType == ScheduleType::OUTER_PRODUCT) {
        return IterateNormL0DB(enPartialSum);
    }

    // Advance the (curM_, curN_) cursor; done once it cannot move further.
    if (!IterateController::MoveNext()) {
        return false;
    }

    // Effective M extent of this step.
    int32_t useM;
    if constexpr (NoTailM(MM_CFG)) {
        useM = var.tiling_.GetBaseM();
    } else if constexpr (IsBasicM(MM_CFG)) {
        useM = var.tailM_;
    } else {
        useM = (var.curM_ + 1 == var.mIter_) ? var.tailM_ : var.tiling_.GetBaseM();
    }
    var.baseUseM_ = useM;

    // Effective N extent of this step.
    int32_t useN;
    if constexpr (NoTailN(MM_CFG)) {
        useN = var.tiling_.GetBaseN();
    } else if constexpr (IsBasicN(MM_CFG)) {
        useN = var.tailN_;
    } else {
        useN = (var.curN_ + 1 == var.nIter_) ? var.tailN_ : var.tiling_.GetBaseN();
    }
    var.baseUseN_ = useN;

    var.blockUseM_ = Ceil(useM, BLOCK_CUBE);
    var.blockUseN_ = Ceil(useN, BLOCK_CUBE);

    LoadC(enPartialSum); // get one C address
    Compute(enPartialSum);

    DEBUG_CODE(var.calCount_++);
    return true;
}

// Norm-schedule iteration when the modular Scheduler is enabled: forwards a
// single iteration step to the Scheduler module.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
template <bool sync>
__aicore__ inline bool MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::IterateNormScheduler(bool enPartialSum)
{
    return MATMUL_MODULE(Scheduler)->ScheduleOnce(enPartialSum);
}

// One iteration step of the OUTER_PRODUCT schedule with L0 double buffering.
// The inner axis (N for ORDER_M, M for ORDER_N) advances two base blocks per
// step whenever a full pair remains (sMadNStep_/sMadMStep_ = 2 * base); the
// last, unpaired step falls back to a single (possibly tail) block.
// @param enPartialSum accumulate into existing L0C content.
// @return true while another block was produced, false when the tile is done.
// Fix: dropped the unused local `normMove` that captured MoveNext()'s result.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
template <bool sync>
__aicore__ inline bool MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::IterateNormL0DB(
    bool enPartialSum)
{
    ASCENDC_ASSERT((ToMatmulConfig(MM_CFG).scheduleType == ScheduleType::OUTER_PRODUCT), {
        KERNEL_LOG(KERNEL_ERROR, "IterateNormL0DB only support scheduleType == OUTER_PRODUCT");
    });
    if (unlikely(var.isFirstIter_)) {
        if constexpr (A_TYPE::layout == LayoutMode::NONE) {
            // Primes the iteration cursor; the result was previously stored in
            // an unused local and is intentionally not checked here.
            // NOTE(review): assumes the first MoveNext() cannot signal an empty
            // tile -- TODO confirm.
            IterateController::MoveNext();
        } else {
            var.isFirstIter_ = false;
            var.curM_ = 0;
            var.curN_ = 0;
        }
        if constexpr (ToMatmulConfig(MM_CFG).iterateOrder == IterateOrder::ORDER_M) {
            // Double-N step when the remaining N blocks still pair up evenly.
            if ((var.tailN_ == var.tiling_.GetBaseN() && var.nIter_ % 2 == 0) || (var.curN_ < var.nIter_ - 2)) {
                var.sMadNStep_ = 2 * var.tiling_.GetBaseN();
            } else {
                if constexpr (A_TYPE::layout == LayoutMode::NONE) {
                    if (var.nIter_ % 2 == 0) {
                        var.sMadNStep_ = 2 * var.tiling_.GetBaseN();
                    } else {
                        var.sMadNStep_ = (var.curN_ + 1 == var.nIter_) ? var.tailN_ : var.tiling_.GetBaseN();
                    }
                } else {
                    var.sMadNStep_ = (var.curN_ + 1 == var.nIter_) ? var.tailN_ : var.tiling_.GetBaseN();
                }
            }
        } else {
            ASCENDC_ASSERT((var.tiling_.GetIterateOrder() == static_cast<int>(IterateOrder::ORDER_N)), {
                KERNEL_LOG(KERNEL_ERROR, "iterateOrder is %d , which should be ORDER_N", var.tiling_.GetIterateOrder());
            });
            // Double-M step when the remaining M blocks still pair up evenly.
            if ((var.tailM_ == var.tiling_.GetBaseM() && var.mIter_ % 2 == 0) || (var.curM_ < var.mIter_ - 2)) {
                var.sMadMStep_ = 2 * var.tiling_.GetBaseM();
            } else {
                if constexpr (A_TYPE::layout == LayoutMode::NONE) {
                    if (var.mIter_ % 2 == 0) {
                        var.sMadMStep_ = 2 * var.tiling_.GetBaseM();
                    } else {
                        var.sMadMStep_ = (var.curM_ + 1 == var.mIter_) ? var.tailM_ : var.tiling_.GetBaseM();
                    }
                } else {
                    var.sMadMStep_ = (var.curM_ + 1 == var.mIter_) ? var.tailM_ : var.tiling_.GetBaseM();
                }
            }
        }
    } else if (likely(var.tiling_.GetIterateOrder() == static_cast<int>(IterateOrder::ORDER_M))) { // Output along M axis
        // Advance N by two blocks when paired, otherwise by one.
        if ((var.tailN_ == var.tiling_.GetBaseN() && var.nIter_ % 2 == 0) || (var.curN_ < var.nIter_ - 2)) {
            var.curN_ = var.curN_ + 2;
        } else {
            if constexpr (A_TYPE::layout == LayoutMode::NONE) {
                if (var.nIter_ % 2 == 0) {
                    var.curN_ = var.curN_ + 2;
                } else {
                    var.curN_ = var.curN_ + 1;
                }
            } else {
                var.curN_ = var.curN_ + 1;
            }
        }
        // Wrap N back and advance M; at the end of M, advance the stepN window.
        if constexpr (A_TYPE::layout == LayoutMode::NONE) {
            if (var.curN_ >= var.stepNIdx_ + var.curStepN_) {
                MATMUL_MODULE(CopyCubeInA)->Reset();
                var.curN_ = var.stepNIdx_;
                if (++var.curM_ >= var.mIter_) {
                    MATMUL_MODULE(CopyCubeInB)->Reset();
                    var.curM_ = 0;
                    var.stepNIdx_ += var.curStepN_;
                    if (var.stepNIdx_ >= var.nIter_) {
                        return false;
                    }
                    var.curN_ = var.stepNIdx_;
                    var.curStepN_ =
                        (var.nIter_ - var.curN_) > var.tiling_.GetStepN() ?
                        var.tiling_.GetStepN() : (var.nIter_ - var.curN_);
                }
            }
        } else {
            if (var.curN_ >= var.nIter_) {
                if (++var.curM_ >= var.mIter_) {
                    var.curM_ = 0;
                    if (var.curN_ >= var.nIter_) {
                        return false;
                    }
                }
                var.curN_ = 0;
            }
        }
        // Recompute the N step extent for the new cursor position.
        if ((var.tailN_ == var.tiling_.GetBaseN() && var.nIter_ % 2 == 0) || (var.curN_ < var.nIter_ - 2)) {
            var.sMadNStep_ = 2 * var.tiling_.GetBaseN();
        } else {
            if constexpr (A_TYPE::layout == LayoutMode::NONE) {
                if (var.nIter_ % 2 == 0) {
                    var.sMadNStep_ = 2 * var.tiling_.GetBaseN();
                } else {
                    var.sMadNStep_ = (var.curN_ + 1 == var.nIter_) ? var.tailN_ : var.tiling_.GetBaseN();
                }
            } else {
                var.sMadNStep_ = (var.curN_ + 1 == var.nIter_) ? var.tailN_ : var.tiling_.GetBaseN();
            }
        }
    } else {
        ASCENDC_ASSERT((var.tiling_.GetIterateOrder() == static_cast<int>(IterateOrder::ORDER_N)), {
            KERNEL_LOG(KERNEL_ERROR, "iterateOrder is %d , which should be ORDER_N", var.tiling_.GetIterateOrder());
        });
        // Advance M by two blocks when paired, otherwise by one.
        if ((var.tailM_ == var.tiling_.GetBaseM() && var.mIter_ % 2 == 0) || (var.curM_ < var.mIter_ - 2)) {
            var.curM_ = var.curM_ + 2;
        } else {
            if constexpr (A_TYPE::layout == LayoutMode::NONE) {
                if (var.mIter_ % 2 == 0) {
                    var.curM_ = var.curM_ + 2;
                } else {
                    var.curM_ = var.curM_ + 1;
                }
            } else {
                var.curM_ = var.curM_ + 1;
            }
        }
        // Wrap M back and advance N; at the end of N, advance the stepM window.
        if constexpr (A_TYPE::layout == LayoutMode::NONE) {
            if (var.curM_ >= var.stepMIdx_ + var.curStepM_) {
                MATMUL_MODULE(CopyCubeInB)->Reset();
                var.curM_ = var.stepMIdx_;
                if (++var.curN_ >= var.nIter_) {
                    MATMUL_MODULE(CopyCubeInA)->Reset();
                    var.curN_ = 0;
                    var.stepMIdx_ += var.curStepM_;
                    if (var.stepMIdx_ >= var.mIter_) {
                        return false;
                    }
                    var.curM_ = var.stepMIdx_;
                    var.curStepM_ =
                        (var.mIter_ - var.curM_) > var.tiling_.GetStepM() ?
                        var.tiling_.GetStepM() : (var.mIter_ - var.curM_);
                }
            }
        } else {
            if (var.curM_ >= var.mIter_) {
                if (++var.curN_ >= var.nIter_) {
                    var.curN_ = 0;
                    if (var.curM_ >= var.mIter_) {
                        return false;
                    }
                }
                var.curM_ = 0;
            }
        }
        // Recompute the M step extent for the new cursor position.
        if ((var.tailM_ == var.tiling_.GetBaseM() && var.mIter_ % 2 == 0) || (var.curM_ < var.mIter_ - 2)) {
            var.sMadMStep_ = 2 * var.tiling_.GetBaseM();
        } else {
            if constexpr (A_TYPE::layout == LayoutMode::NONE) {
                if (var.mIter_ % 2 == 0) {
                    var.sMadMStep_ = 2 * var.tiling_.GetBaseM();
                } else {
                    var.sMadMStep_ = (var.curM_ + 1 == var.mIter_) ? var.tailM_ : var.tiling_.GetBaseM();
                }
            } else {
                var.sMadMStep_ = (var.curM_ + 1 == var.mIter_) ? var.tailM_ : var.tiling_.GetBaseM();
            }
        }
    }
    // Initializing variables
    var.baseUseM_ = (var.curM_ + 1 == var.mIter_) ? var.tailM_ : var.tiling_.GetBaseM();
    var.baseUseN_ = (var.curN_ + 1 == var.nIter_) ? var.tailN_ : var.tiling_.GetBaseN();
    var.blockUseM_ = Ceil(var.baseUseM_, BLOCK_CUBE);
    var.blockUseN_ = Ceil(var.baseUseN_, BLOCK_CUBE);

    LoadC(enPartialSum); // get one C address
    ComputeNormL0DB(enPartialSum);

    DEBUG_CODE(var.calCount_++);
    return true;
}

// IB-share norm iteration: forwards a single iteration step to the Scheduler
// module.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
template <bool sync>
__aicore__ inline bool MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::IterateIBShareNorm(
    bool enPartialSum)
{
    return MATMUL_MODULE(Scheduler)->ScheduleOnce(enPartialSum);
}

// Special-MDL iteration: forwards a single iteration step to the Scheduler
// module.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
template <bool sync>
__aicore__ inline bool MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::IterateSpecialMDL(
    bool enPartialSum)
{
    return MATMUL_MODULE(Scheduler)->ScheduleOnce(enPartialSum);
}

#if __CCE_AICORE__ == 220 || __CCE_AICORE__ == 200 || __CCE_AICORE__ == 300
// Computes the element offset of one batch of matrix A inside the L1 staging
// buffer, remapping the batch index when broadcast (BRC) is applied on the
// G, N or B layout axis.
// @param batchNum     total batches staged in L1 for this outer split.
// @param batchIdx     batch index within the current split.
// @param splitOuterIdx / splitSize  outer split position and split count.
// @return element offset of the selected batch in the A staging buffer.
// Change: the transpose branch now derives alignSize with the same
// constexpr ternary as GetBatchIterateBOffset, for consistency.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline int32_t MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::GetBatchIterateAOffset(
    const int32_t batchNum, const int32_t batchIdx,
    const int32_t splitOuterIdx, const int32_t splitSize)
{
    int32_t tmpBatchIdx = batchIdx + splitOuterIdx * batchNum / splitSize;
    if (var.tiling_.GetALayoutInfoG() == 1 && var.tiling_.GetBLayoutInfoG() != 1) { // BRC for G axis
        ASSERT(var.tiling_.GetBLayoutInfoG() > 0);
        ASSERT(var.tiling_.GetALayoutInfoN() == var.tiling_.GetBLayoutInfoN());
        ASSERT(var.tiling_.GetALayoutInfoB() == var.tiling_.GetBLayoutInfoB());
        tmpBatchIdx = tmpBatchIdx / var.tiling_.GetBLayoutInfoG();
    } else if (var.tiling_.GetALayoutInfoN() == 1 && var.tiling_.GetBLayoutInfoN() != 1) {
        // BRC for N axis = idx % BLayoutInfoG + idx / (BLayoutInfoG * BLayoutInfoN)
        ASSERT(var.tiling_.GetBLayoutInfoN() > 0);
        ASSERT(var.tiling_.GetALayoutInfoB() == var.tiling_.GetBLayoutInfoB());
        ASSERT(var.tiling_.GetALayoutInfoG() == var.tiling_.GetBLayoutInfoG());
        tmpBatchIdx = tmpBatchIdx % var.tiling_.GetBLayoutInfoG() + tmpBatchIdx /
            (var.tiling_.GetBLayoutInfoG() * var.tiling_.GetBLayoutInfoN());
    } else if (var.tiling_.GetALayoutInfoB() == 1 && var.tiling_.GetBLayoutInfoB() != 1 && A_TYPE::layout !=
        LayoutMode::NORMAL) { // BRC for B axis
        ASSERT(var.tiling_.GetBLayoutInfoB() > 0);
        ASSERT(var.tiling_.GetALayoutInfoG() == var.tiling_.GetBLayoutInfoG()); // multi axis BRC is not supported.
        tmpBatchIdx = tmpBatchIdx % (var.tiling_.GetBLayoutInfoG() * var.tiling_.GetBLayoutInfoN()) + tmpBatchIdx /
            (var.tiling_.GetBLayoutInfoG() * var.tiling_.GetBLayoutInfoN() * var.tiling_.GetBLayoutInfoB());
    }
    if constexpr (A_TYPE::layout == LayoutMode::NORMAL) {
        tmpBatchIdx = tmpBatchIdx / (batchNum / batchA_);
    }
    if (var.isTransposeA_) {
        int32_t alignM = Ceil(var.singleCoreM_, c0Size_) * c0Size_;
        // int8 inputs align K to c0Size_, other dtypes to BLOCK_CUBE
        // (mirrors the non-transpose branch of GetBatchIterateBOffset).
        constexpr int32_t alignSize = IsSameType<SrcT, int8_t>::value ? c0Size_ : BLOCK_CUBE;
        int32_t alignK = Ceil(var.singleCoreK_, alignSize) * alignSize;
        return alignM * alignK * tmpBatchIdx;
    } else {
        int32_t alignM = Ceil(var.singleCoreM_, BLOCK_CUBE) * BLOCK_CUBE;
        int32_t alignK = Ceil(var.singleCoreK_, c0Size_) * c0Size_;
        return alignM * alignK * tmpBatchIdx;
    }
}

// Computes the element offset of one batch of matrix B inside the L1 staging
// buffer, remapping the batch index when broadcast (BRC) is applied on the
// G, N or B layout axis.
// @param batchNum     total batches staged in L1 for this outer split.
// @param batchIdx     batch index within the current split.
// @param splitOuterIdx / splitSize  outer split position and split count.
// @return element offset of the selected batch in the B staging buffer.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline int32_t MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::GetBatchIterateBOffset(
    const int32_t batchNum, const int32_t batchIdx,
    const int32_t splitOuterIdx, const int32_t splitSize)
{
    // Absolute batch index across the outer splits.
    int32_t adjIdx = batchIdx + splitOuterIdx * batchNum / splitSize;
    if (var.tiling_.GetBLayoutInfoG() == 1 && var.tiling_.GetALayoutInfoG() != 1) {
        // BRC on the G axis: collapse the broadcast G extent of A.
        ASSERT(var.tiling_.GetALayoutInfoG() > 0);
        ASSERT(var.tiling_.GetALayoutInfoN() == var.tiling_.GetBLayoutInfoN());
        ASSERT(var.tiling_.GetALayoutInfoB() == var.tiling_.GetBLayoutInfoB());
        adjIdx = adjIdx / var.tiling_.GetALayoutInfoG();
    } else if (var.tiling_.GetBLayoutInfoN() == 1 && var.tiling_.GetALayoutInfoN() != 1) {
        // BRC on the N axis: idx % G + idx / (G * N).
        ASSERT(var.tiling_.GetALayoutInfoN() > 0);
        ASSERT(var.tiling_.GetALayoutInfoB() == var.tiling_.GetBLayoutInfoB());
        ASSERT(var.tiling_.GetALayoutInfoG() == var.tiling_.GetBLayoutInfoG());
        adjIdx = adjIdx % var.tiling_.GetALayoutInfoG() +
            adjIdx / (var.tiling_.GetALayoutInfoG() * var.tiling_.GetALayoutInfoN());
    } else if (var.tiling_.GetBLayoutInfoB() == 1 && var.tiling_.GetALayoutInfoB() != 1) {
        // BRC on the B axis: idx % (G * N) + idx / (G * N * B).
        ASSERT(var.tiling_.GetALayoutInfoB() > 0);
        ASSERT(var.tiling_.GetALayoutInfoN() == var.tiling_.GetBLayoutInfoN());
        ASSERT(var.tiling_.GetALayoutInfoG() == var.tiling_.GetBLayoutInfoG()); // multi axis BRC is not supported.
        adjIdx = adjIdx % (var.tiling_.GetALayoutInfoG() * var.tiling_.GetALayoutInfoN()) +
            adjIdx / (var.tiling_.GetALayoutInfoG() * var.tiling_.GetALayoutInfoN() * var.tiling_.GetALayoutInfoB());
    }
    if constexpr (A_TYPE::layout == LayoutMode::NORMAL) {
        adjIdx = adjIdx / (batchNum / batchB_);
    }
    // Size of one aligned B batch depends on the transpose flag.
    int32_t alignN;
    int32_t alignK;
    if (var.isTransposeB_) {
        alignN = Ceil(var.singleCoreN_, BLOCK_CUBE) * BLOCK_CUBE;
        alignK = Ceil(var.singleCoreK_, c0Size_) * c0Size_;
    } else {
        constexpr int32_t alignSize = IsSameType<SrcT, int8_t>::value ? c0Size_ : BLOCK_CUBE;
        alignN = Ceil(var.singleCoreN_, c0Size_) * c0Size_;
        alignK = Ceil(var.singleCoreK_, alignSize) * alignSize;
    }
    return alignN * alignK * adjIdx;
}

// Computes the element offset of one batch of bias inside the L1 staging
// buffer, remapping the batch index when a reduce is applied on the G, N or
// B layout axis. In the G-reduce case, bias must be added only once per
// reduce group, so enableBiase is cleared for all but the first member.
// NOTE: the isBiasBatch early-out is deliberately placed AFTER the reduce
// logic so the enableBiase side effect still happens when bias is unbatched.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
          MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline int32_t
MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::GetBatchIterateBiasOffset(
    const int32_t batchNum, const int32_t batchIdx, bool& enableBiase, const int32_t splitOuterIdx,
    const int32_t splitSize)
{
    int32_t tmpBatchIdx = batchIdx + splitOuterIdx * batchNum / splitSize;
    if (var.tiling_.GetCLayoutInfoG() == 1 &&
        (var.tiling_.GetALayoutInfoG() != 1 || var.tiling_.GetBLayoutInfoG() != 1)) {
        // Reduce for G axis
        ASSERT(var.tiling_.GetALayoutInfoG() > 0 && var.tiling_.GetBLayoutInfoG() > 0);
        ASSERT(var.tiling_.GetCLayoutInfoN() != 1 ||
               (var.tiling_.GetALayoutInfoN() == 1 && var.tiling_.GetBLayoutInfoN() == 1));
        // multi axis BRC is not supported.
        ASSERT(var.tiling_.GetCLayoutInfoB() != 1 ||
               (var.tiling_.GetALayoutInfoB() == 1 && var.tiling_.GetBLayoutInfoB() == 1));
        auto gExtend =
            var.tiling_.GetALayoutInfoG() != 1 ? var.tiling_.GetALayoutInfoG() : var.tiling_.GetBLayoutInfoG();
        // Only the first batch of each G-reduce group contributes the bias.
        if (tmpBatchIdx % gExtend != 0) {
            enableBiase = false;
        }
        tmpBatchIdx = tmpBatchIdx / gExtend;
    } else if (var.tiling_.GetCLayoutInfoN() == 1 &&
               (var.tiling_.GetALayoutInfoN() != 1 || var.tiling_.GetBLayoutInfoN() != 1)) {
        // Reduce for N axis
        ASSERT(var.tiling_.GetALayoutInfoN() > 0 && var.tiling_.GetBLayoutInfoN() > 0);
        ASSERT(var.tiling_.GetCLayoutInfoB() != 1 ||
               (var.tiling_.GetALayoutInfoB() == 1 && var.tiling_.GetBLayoutInfoB() == 1));
        ASSERT(var.tiling_.GetCLayoutInfoG() != 1 ||
               (var.tiling_.GetALayoutInfoG() == 1 && var.tiling_.GetBLayoutInfoG() == 1));
        auto gExtend =
            var.tiling_.GetALayoutInfoG() != 1 ? var.tiling_.GetALayoutInfoG() : var.tiling_.GetBLayoutInfoG();
        auto nExtend =
            var.tiling_.GetALayoutInfoN() != 1 ? var.tiling_.GetALayoutInfoN() : var.tiling_.GetBLayoutInfoN();
        tmpBatchIdx = tmpBatchIdx % gExtend + tmpBatchIdx / (gExtend * nExtend);
    } else if (var.tiling_.GetCLayoutInfoB() == 1 &&
               (var.tiling_.GetALayoutInfoB() != 1 || var.tiling_.GetBLayoutInfoB() != 1)) {
        // Reduce for B axis
        ASSERT(var.tiling_.GetALayoutInfoB() > 0 && var.tiling_.GetBLayoutInfoB() > 0);
        ASSERT(var.tiling_.GetCLayoutInfoN() != 1 ||
               (var.tiling_.GetALayoutInfoN() == 1 && var.tiling_.GetBLayoutInfoN() == 1));
        ASSERT(var.tiling_.GetCLayoutInfoG() != 1 ||
               (var.tiling_.GetALayoutInfoG() == 1 && var.tiling_.GetBLayoutInfoG() == 1));
        auto gExtend =
            var.tiling_.GetALayoutInfoG() != 1 ? var.tiling_.GetALayoutInfoG() : var.tiling_.GetBLayoutInfoG();
        auto nExtend =
            var.tiling_.GetALayoutInfoN() != 1 ? var.tiling_.GetALayoutInfoN() : var.tiling_.GetBLayoutInfoN();
        auto bExtend =
            var.tiling_.GetALayoutInfoB() != 1 ? var.tiling_.GetALayoutInfoB() : var.tiling_.GetBLayoutInfoB();
        tmpBatchIdx = tmpBatchIdx % (gExtend * nExtend) + tmpBatchIdx / (gExtend * nExtend * bExtend);
    }
    // Unbatched bias: every batch reads the same bias at offset 0.
    if constexpr (!ToMatmulConfig(MM_CFG).isBiasBatch) {
        return 0;
    }
    return CeilAlignNum(var.singleCoreN_, AscendCUtils::GetC0Count(sizeof(BiasT))) * tmpBatchIdx;
}

// Updates per-batch iteration state before running Iterate() for one batch:
// refreshes debug data lengths, advances/rebinds the bias address, updates
// the quant tensor where required, and re-arms the iteration flag.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::UpdateBatchIterateInfo(
    const int32_t batchNum, const int32_t batchIdx,
    const int32_t splitOuterIdx, const int32_t splitSize)
{
    // Support BRC on the BNG axis of the AB matrix.
#ifdef ASCENDC_CPU_DEBUG
    // CPU-debug only: recompute the aligned single-batch byte sizes so the
    // debug tensors report the correct data length per batch.
    int aMatrixSingleBatchSize;
    int bMatrixSingleBatchSize;
    if (var.isTransposeA_) {
        if constexpr (IsSameType<SrcT, int8_t>::value) {
            aMatrixSingleBatchSize = Ceil(var.tiling_.GetSingleCoreM(), c0Size_) * c0Size_ * \
                Ceil(var.tiling_.GetSingleCoreK(), c0Size_) * c0Size_ * sizeof(SrcT);
        } else {
            aMatrixSingleBatchSize = Ceil(var.tiling_.GetSingleCoreM(), c0Size_) * c0Size_ * \
                Ceil(var.tiling_.GetSingleCoreK(), BLOCK_CUBE) * BLOCK_CUBE * sizeof(SrcT);
        }
    } else {
        aMatrixSingleBatchSize = Ceil(var.tiling_.GetSingleCoreM(), BLOCK_CUBE) * BLOCK_CUBE * \
            Ceil(var.tiling_.GetSingleCoreK(), c0Size_) * c0Size_ * sizeof(SrcT);
    }

    if (var.isTransposeB_) {
        bMatrixSingleBatchSize = Ceil(var.tiling_.GetSingleCoreK(), c0Size_) * c0Size_ * \
            Ceil(var.tiling_.GetSingleCoreN(), BLOCK_CUBE) * BLOCK_CUBE * sizeof(SrcT);
    } else {
        if constexpr (IsSameType<SrcT, int8_t>::value) {
            bMatrixSingleBatchSize = Ceil(var.tiling_.GetSingleCoreK(), c0Size_) * c0Size_ * \
                Ceil(var.tiling_.GetSingleCoreN(), c0Size_) * c0Size_ * sizeof(SrcT);
        } else {
            bMatrixSingleBatchSize = Ceil(var.tiling_.GetSingleCoreK(), BLOCK_CUBE) * BLOCK_CUBE * \
                Ceil(var.tiling_.GetSingleCoreN(), c0Size_) * c0Size_ * sizeof(SrcT);
        }
    }
    var.leftMatrix_.address_.dataLen = aMatrixSingleBatchSize;
    var.rightMatrix_.address_.dataLen = bMatrixSingleBatchSize;
#endif
#if __CCE_AICORE__ == 200
    // v200: bias advances linearly by singleCoreN_ per batch (skip first).
    if constexpr (ToMatmulConfig(MM_CFG).enableSetBias) {
        if (var.enableBias_) {
            if (batchIdx != 0 || splitOuterIdx != 0) {
                var.biasGlobal_ += var.singleCoreN_;
            }
        }
    }
    if constexpr (IsSameType<SrcT, int8_t>::value && IsSameType<DstT, half>::value) {
        if (batchIdx != 0 || splitOuterIdx != 0) {
            MatmulQuantProcessor::UpdateQuantTensor(var.singleCoreN_);
        }
    }
#else
    // Other cores: rebind the bias address via the batch-offset helper, which
    // also handles the reduce/BRC remapping (and may clear enableBias_).
    if constexpr (ToMatmulConfig(MM_CFG).enableSetBias) {
        if (var.enableBias_) {
            int32_t offsetBias =
                GetBatchIterateBiasOffset(batchNum, batchIdx, var.enableBias_, splitOuterIdx, splitSize);
            var.inputBias_.address_ = var.cacheHeadBias_[offsetBias].address_;
        }
    }
    if constexpr (DoMatmulMDL(MM_CFG) || DoMatmulSpecialMDL(MM_CFG)) {
        MatmulQuantProcessor::UpdateQuantTensor(var.singleCoreN_);
    }
#endif
    // Re-arm the per-batch iteration so the next Iterate() starts fresh.
    var.isFirstIter_ = true;
}

// Copies the result block of one batch to its destination in GM, supporting
// continuous (NORMAL/BNGS1S2), discontinuous (BSNGD/SBNGD) and G-reduce
// layouts. In the G-reduce case the copy runs under atomic-add so all
// batches of a reduce group accumulate into the same output slot.
// @param cGlobal             base GM tensor of the output.
// @param iBatchIn            batch index being emitted.
// @param enAtomic            atomic mode forwarded to the copy-out.
// @param enSequentialWriteIn sequential-write flag (rejected for BSNGD/SBNGD).
// Fix: removed the unused locals nGapOffset / mGapOffset.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::GetTensorCForBatch(
    const GlobalTensor<DstT> &cGlobal, const int32_t iBatchIn, uint8_t enAtomic, bool enSequentialWriteIn)
{
    // supports continuous, discontinuous and reduce transfer on the GM. (three layout types are supported)
    uint64_t offset = 0;
    uint32_t reduceGNum = 0;
    bool isReduceG =
        ((var.tiling_.GetCLayoutInfoG() == 1) && (var.tiling_.GetALayoutInfoG() != 1 || var.tiling_.GetBLayoutInfoG() != 1));
    if (isReduceG) {
        reduceGNum = var.tiling_.GetALayoutInfoG() >= var.tiling_.GetBLayoutInfoG() ? var.tiling_.GetALayoutInfoG()
                                                                            : var.tiling_.GetBLayoutInfoG();
    } else {
        reduceGNum = var.tiling_.GetCLayoutInfoG();
    }
    // All batches of one reduce group map to the same output batch index.
    uint32_t iBatch = isReduceG ? (iBatchIn / reduceGNum) : iBatchIn;
    if (isReduceG) {
        SetAtomicAdd<DstT>();
    }
    if ((C_TYPE::layout == LayoutMode::BSNGD) || (C_TYPE::layout == LayoutMode::SBNGD)) {
        ASSERT(enSequentialWriteIn == false && "Layout BSNGD or SBNGD can not be SequentialWrite");
    }
    // Scenario 1: Continuous copy
    if constexpr (C_TYPE::layout == LayoutMode::BNGS1S2 || C_TYPE::layout == LayoutMode::NORMAL) {
        int32_t alignedSingleCoreN = Ceil(var.tiling_.GetSingleCoreN(), AscendCUtils::GetC0Count(sizeof(DstT))) *
            AscendCUtils::GetC0Count(sizeof(DstT));
        if constexpr (PhyPosIsGM(C_TYPE::pos)) {
            alignedSingleCoreN = var.tiling_.GetSingleCoreN();
        }
        if constexpr (C_TYPE::format == CubeFormat::NZ) {
            alignedSingleCoreN = Ceil(var.tiling_.GetSingleCoreN(), BLOCK_CUBE) * BLOCK_CUBE;
        }
        offset = iBatch * var.tiling_.GetSingleCoreM() * alignedSingleCoreN;
        GetTensorCImpl(cGlobal[offset], enAtomic, enSequentialWriteIn);
    } else {
        // Scenario 2: disconsecutive copy
        if constexpr (!(C_TYPE::layout == LayoutMode::BSNGD || C_TYPE::layout == LayoutMode::SBNGD)) {  // BSNGD SBNGD
            ASSERT(false && "Can not support other Layout");
        }
        offset = iBatch * var.tiling_.GetSingleCoreN();
        GetTensorCByLayout(cGlobal[offset], enAtomic, enSequentialWriteIn, 0, 0);
    }
    if (isReduceG) {
        SetAtomicNone();
    }
}

// Copies the result block of one batch to a local (UB) destination at a
// contiguous, N-aligned per-batch offset.
// @param dst                 local destination tensor.
// @param iBatchIn            batch index being emitted.
// @param enAtomic            atomic mode forwarded to the copy-out.
// @param enSequentialWriteIn sequential-write flag forwarded to the copy-out.
// Fix: widen to uint64_t before multiplying so a large batch index cannot
// overflow the previous int32 product.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::GetTensorCForBatch(
    const LocalTensor<DstT> &dst, const int32_t iBatchIn, uint8_t enAtomic, bool enSequentialWriteIn)
{
    int32_t alignedSingleCoreN = Ceil(var.tiling_.GetSingleCoreN(), AscendCUtils::GetC0Count(sizeof(DstT))) *
        AscendCUtils::GetC0Count(sizeof(DstT));
    uint64_t offset = static_cast<uint64_t>(iBatchIn) * var.tiling_.GetSingleCoreM() * alignedSingleCoreN;
    GetTensorCImpl(dst[offset], enAtomic, enSequentialWriteIn);
}

// Runs the full batched matmul for one outer batch chunk writing to GM:
// stages batchNum A/B matrices (optionally split in two halves to halve the
// L1 footprint), then iterates every batch through Iterate()/GetTensorCForBatch,
// synchronizing MTE2->MTE1 after the batch load and M->MTE1 after each copy-out.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::ComputeBatch(
    const GlobalTensor<DstT>& gm, bool enPartialSum, uint8_t enAtomic, bool enSequentialWrite,
    const uint32_t matrixStrideA, const uint32_t matrixStrideB, const int32_t batchOuterIdx)
{
#if __CCE_AICORE__ == 200
    // v200: flush the entire data cache before reading inputs from GM.
    GlobalTensor<uint64_t> global;
    global.SetGlobalBuffer((__gm__ uint64_t*)0);
    DataCacheCleanAndInvalid<uint64_t, CacheLine::ENTIRE_DATA_CACHE>(global);
#endif
    // Check that the total amount of data to be transferred is less than L1.
    ASSERT((batchA_ * var.tiling_.GetSingleCoreM() * var.tiling_.GetSingleCoreK() + batchB_ * var.tiling_.GetSingleCoreN() *
        var.tiling_.GetSingleCoreK()) * sizeof(SrcT) <= TOTAL_L1_SIZE);
    if constexpr (DoMatmulNorm(MM_CFG) || DoMatmulBasicBlock(MM_CFG) || DoMatmulSpecialBasicBlock(MM_CFG)) {
        // Split the batch load in two halves when both batch counts are even,
        // so each half fits tighter in L1.
        int32_t batchNum = batchA_ > batchB_ ? batchA_ : batchB_;
        int32_t splitSize = (batchNum >= 2) && (batchA_ % 2 == 0) && (batchB_ % 2 == 0)? 2 : 1;
        int32_t splitBatchNum = batchNum / splitSize;
        auto matrixA = MATMUL_MODULE(BatchCopyCubeInA)->AllocTensor();
        auto matrixB = MATMUL_MODULE(BatchCopyCubeInB)->AllocTensor();
#if __CCE_AICORE__ >= 220
        // Transfer the batchNum Bias matrix to L1 at one time.
        if constexpr (ToMatmulConfig(MM_CFG).enableSetBias &&
            (ToMatmulConfig(MM_CFG).batchMode != BatchMode::BATCH_LARGE_THAN_L1 ||
            ToMatmulConfig(MM_CFG).isBiasBatch)) {
            LoadBatchBiasToL1(batchOuterIdx);
        }
#endif
        event_t eventIDMte2ToMte1 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE2_MTE1));
        event_t eventIDMToMte1 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::M_MTE1));
        for (int32_t outer = 0; outer < splitSize; ++outer) {
            MATMUL_MODULE(BatchCopyCubeInA)->BatchLoad(matrixA, matrixStrideA, batchOuterIdx, outer, splitSize);
            MATMUL_MODULE(BatchCopyCubeInB)->BatchLoad(matrixB, matrixStrideB, batchOuterIdx, outer, splitSize);
            // Wait until the batch load (MTE2) lands before L1->L0 reads (MTE1).
            SetFlag<HardEvent::MTE2_MTE1>(eventIDMte2ToMte1);
            WaitFlag<HardEvent::MTE2_MTE1>(eventIDMte2ToMte1);
            ASSERT(batchA_ > 0 && batchB_ > 0 && (batchA_ % batchB_ == 0 || batchB_ % batchA_ == 0));
            for (int32_t iBatch = 0; (iBatch < splitBatchNum) && (outer * splitBatchNum < batchNum); ++iBatch) {
                // Set the start address on L1 for each batch calculation.
                // SetTensorA()/SetTensorB()/SetBias()/SetTail()/SetQuantVector()
                if constexpr (ToMatmulConfig(MM_CFG).singleCoreM != 0 && ToMatmulConfig(MM_CFG).singleCoreN != 0 &&
                    ToMatmulConfig(MM_CFG).singleCoreK != 0) {
                    // Compile-time shapes: use the constant-folded offset helpers.
                    int32_t offsetA = GetBatchIterateAOffsetConstant(batchNum, iBatch, outer, splitSize);
                    var.leftMatrix_.address_ = matrixA[offsetA].address_;
                    int32_t offsetB = GetBatchIterateBOffsetConstant(batchNum, iBatch, outer, splitSize);
                    var.rightMatrix_.address_ = matrixB[offsetB].address_;
                    UpdateBatchIterateInfoConstant(batchNum, iBatch, outer, splitSize);
                } else {
                    int32_t offsetA = GetBatchIterateAOffset(batchNum, iBatch, outer, splitSize);
                    var.leftMatrix_.address_ = matrixA[offsetA].address_;
                    int32_t offsetB = GetBatchIterateBOffset(batchNum, iBatch, outer, splitSize);
                    var.rightMatrix_.address_ = matrixB[offsetB].address_;
                    UpdateBatchIterateInfo(batchNum, iBatch, outer, splitSize);
                }
                while (Iterate(enPartialSum)) {
                    // GetensorC
                    GetTensorCForBatch(gm, iBatch + outer * splitBatchNum, enAtomic, enSequentialWrite);
                    // Keep L0 reads (MTE1) behind the cube (M) for this block.
                    SetFlag<HardEvent::M_MTE1>(eventIDMToMte1);
                    WaitFlag<HardEvent::M_MTE1>(eventIDMToMte1);
#if __CCE_AICORE__ == 200
                    // v200 with UB reuse (and no L1 cache of UB): serialize
                    // copy-out (MTE3) against the next input load (MTE2).
                    if constexpr (ToMatmulConfig(MM_CFG).enableUBReuse && !ToMatmulConfig(MM_CFG).enableL1CacheUB) {
                        event_t eventIDMte3ToMte2 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE3_MTE2));
                        SetFlag<HardEvent::MTE3_MTE2>(eventIDMte3ToMte2);
                        WaitFlag<HardEvent::MTE3_MTE2>(eventIDMte3ToMte2);
                    } else if constexpr (ToMatmulConfig(MM_CFG).enableL1CacheUB) {
                        if ((var.tiling_.GetDepthAL1CacheUB() == 0 && A_TYPE::format == CubeFormat::ND) ||
                            (var.tiling_.GetDepthBL1CacheUB() == 0 && B_TYPE::format == CubeFormat::ND)) {
                            event_t eventIDMte3ToMte2 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE3_MTE2));
                            SetFlag<HardEvent::MTE3_MTE2>(eventIDMte3ToMte2);
                            WaitFlag<HardEvent::MTE3_MTE2>(eventIDMte3ToMte2);
                        }
                    }
#endif
                }
            }
            End();
        }

#if __CCE_AICORE__ >= 220
        // Release the bias cache staged by LoadBatchBiasToL1 above.
        if constexpr (ToMatmulConfig(MM_CFG).enableSetBias &&
            (ToMatmulConfig(MM_CFG).batchMode != BatchMode::BATCH_LARGE_THAN_L1 ||
            ToMatmulConfig(MM_CFG).isBiasBatch)) {
            if (var.tiling_.IsBias()) {
                var.qidBias_.FreeTensor(var.cacheHeadBias_);
                var.qidBias_.FreeAllEvent();
            }
        }
#endif
        MATMUL_MODULE(BatchCopyCubeInA)->BatchDestroy();
        MATMUL_MODULE(BatchCopyCubeInB)->BatchDestroy();
    }
}

// Computes one outer group of batched matmuls and writes each batch's result
// into the local (UB) tensor dst. The A/B batch matrices are bulk-loaded into
// L1 first; then, per batch, the L1 base addresses are selected and the normal
// Iterate()/GetTensorCForBatch() pipeline is run.
// Params:
//   dst               - UB output tensor receiving each batch's C matrix.
//   enPartialSum      - forwarded to Iterate(); accumulate onto existing C when true.
//   enAtomic          - atomic mode forwarded to the per-batch copy-out.
//   enSequentialWrite - forwarded to the per-batch copy-out (sequential layout).
//   matrixStrideA/B   - strides between batch matrices, forwarded to BatchLoad.
//   batchOuterIdx     - index of the current outer batch group (used for bias load).
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::ComputeBatch(
    const LocalTensor<DstT>& dst, bool enPartialSum, uint8_t enAtomic, bool enSequentialWrite,
    const uint32_t matrixStrideA, const uint32_t matrixStrideB, const int32_t batchOuterIdx)
{
#if __CCE_AICORE__ == 200
    // v200: clean/invalidate the entire data cache before reading batch data from GM.
    GlobalTensor<uint64_t> global;
    global.SetGlobalBuffer((__gm__ uint64_t*)0);
    DataCacheCleanAndInvalid<uint64_t, CacheLine::ENTIRE_DATA_CACHE>(global);
#endif
    // Check that the total amount of data to be transferred is less than L1.
    ASSERT((batchA_ * var.tiling_.GetSingleCoreM() * var.tiling_.GetSingleCoreK() + batchB_ * var.tiling_.GetSingleCoreN() *
        var.tiling_.GetSingleCoreK()) * sizeof(SrcT) <= TOTAL_L1_SIZE);
    if constexpr (DoMatmulNorm(MM_CFG) || DoMatmulBasicBlock(MM_CFG) || DoMatmulSpecialBasicBlock(MM_CFG)) {
        int32_t batchNum = batchA_ > batchB_ ? batchA_ : batchB_;
        // When there are at least two batches and both batch counts are even, the
        // batches are processed in two halves (splitSize == 2); otherwise one pass.
        // NOTE(review): presumably this halves the per-pass L1 working set — confirm.
        int32_t splitSize = (batchNum >= 2) && (batchA_ % 2 == 0) && (batchB_ % 2 == 0)? 2 : 1;
        int32_t splitBatchNum = batchNum / splitSize;
        auto matrixA = MATMUL_MODULE(BatchCopyCubeInA)->AllocTensor();
        auto matrixB = MATMUL_MODULE(BatchCopyCubeInB)->AllocTensor();
#if __CCE_AICORE__ >= 220
        // Transfer the batchNum Bias matrix to L1 at one time.
        if constexpr (ToMatmulConfig(MM_CFG).enableSetBias &&
            (ToMatmulConfig(MM_CFG).batchMode != BatchMode::BATCH_LARGE_THAN_L1 ||
            ToMatmulConfig(MM_CFG).isBiasBatch)) {
            LoadBatchBiasToL1(batchOuterIdx);
        }
#endif
        event_t eventIDMte2ToMte1 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE2_MTE1));
        event_t eventIDMToMte1 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::M_MTE1));
        for (int32_t outer = 0; outer < splitSize; ++outer) {
            // Bulk-load this half's A and B batch matrices into L1.
            MATMUL_MODULE(BatchCopyCubeInA)->BatchLoad(matrixA, matrixStrideA, batchOuterIdx, outer, splitSize);
            MATMUL_MODULE(BatchCopyCubeInB)->BatchLoad(matrixB, matrixStrideB, batchOuterIdx, outer, splitSize);
            // MTE2 -> MTE1 barrier: L1 must be fully written before it is consumed.
            SetFlag<HardEvent::MTE2_MTE1>(eventIDMte2ToMte1);
            WaitFlag<HardEvent::MTE2_MTE1>(eventIDMte2ToMte1);
            // One batch count must evenly divide the other (broadcast semantics).
            ASSERT(batchA_ > 0 && batchB_ > 0 && (batchA_ % batchB_ == 0 || batchB_ % batchA_ == 0));
            for (int32_t iBatch = 0; (iBatch < splitBatchNum) && (outer * splitBatchNum < batchNum); ++iBatch) {
                // Set the start address on L1 for each batch calculation.
                // SetTensorA()/SetTensorB()/SetBias()/SetTail()/SetQuantVector()
                if constexpr (ToMatmulConfig(MM_CFG).singleCoreM != 0 && ToMatmulConfig(MM_CFG).singleCoreN != 0 &&
                    ToMatmulConfig(MM_CFG).singleCoreK != 0) {
                    // Compile-time tiling: offsets derived from constant single-core shapes.
                    int32_t offsetA = GetBatchIterateAOffsetConstant(batchNum, iBatch, outer, splitSize);
                    var.leftMatrix_.address_ = matrixA[offsetA].address_;
                    int32_t offsetB = GetBatchIterateBOffsetConstant(batchNum, iBatch, outer, splitSize);
                    var.rightMatrix_.address_ = matrixB[offsetB].address_;
                    UpdateBatchIterateInfoConstant(batchNum, iBatch, outer, splitSize);
                } else {
                    // Runtime tiling: offsets derived from the tiling data.
                    int32_t offsetA = GetBatchIterateAOffset(batchNum, iBatch, outer, splitSize);
                    var.leftMatrix_.address_ = matrixA[offsetA].address_;
                    int32_t offsetB = GetBatchIterateBOffset(batchNum, iBatch, outer, splitSize);
                    var.rightMatrix_.address_ = matrixB[offsetB].address_;
                    UpdateBatchIterateInfo(batchNum, iBatch, outer, splitSize);
                }
                while (Iterate(enPartialSum)) {
                    // GetTensorC: copy this iteration's C block out for batch iBatch.
                    GetTensorCForBatch(dst, iBatch + outer * splitBatchNum, enAtomic, enSequentialWrite);
                    // M -> MTE1 barrier before the next iteration reloads L0 inputs.
                    SetFlag<HardEvent::M_MTE1>(eventIDMToMte1);
                    WaitFlag<HardEvent::M_MTE1>(eventIDMToMte1);
#if __CCE_AICORE__ == 200
                    // v200: V -> MTE2 barrier so the next copy-in does not overwrite
                    // UB still being consumed by the vector stage.
                    event_t eventIDVToMte2 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::V_MTE2));
                    SetFlag<HardEvent::V_MTE2>(eventIDVToMte2);
                    WaitFlag<HardEvent::V_MTE2>(eventIDVToMte2);
#endif
                }
            }
            End();
        }

#if __CCE_AICORE__ >= 220
        // Release the L1 bias buffer allocated by LoadBatchBiasToL1().
        if constexpr (ToMatmulConfig(MM_CFG).enableSetBias &&
            (ToMatmulConfig(MM_CFG).batchMode != BatchMode::BATCH_LARGE_THAN_L1 ||
            ToMatmulConfig(MM_CFG).isBiasBatch)) {
            if (var.tiling_.IsBias()) {
                var.qidBias_.FreeTensor(var.cacheHeadBias_);
                var.qidBias_.FreeAllEvent();
            }
        }
#endif
        MATMUL_MODULE(BatchCopyCubeInA)->BatchDestroy();
        MATMUL_MODULE(BatchCopyCubeInB)->BatchDestroy();
    }
}

// Runs a full batched-matmul pass whose results land in GM. The loop
// structure and copy-in/compute/copy-out ordering are owned entirely by the
// BatchScheduler module; this entry point only forwards the arguments.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::IterateBatch(
    const GlobalTensor<DstT>& gm, bool enPartialSum, uint8_t enAtomic, bool enSequentialWrite,
    const uint32_t matrixStrideA, const uint32_t matrixStrideB, const uint32_t matrixStrideC)
{
    auto batchScheduler = MATMUL_MODULE(BatchScheduler);
    batchScheduler->Schedule(gm, enPartialSum, enAtomic, enSequentialWrite,
                             matrixStrideA, matrixStrideB, matrixStrideC);
}

#endif

#if __CCE_AICORE__ < 220
// v100, v200
// v100/v200 IterateAll (GM output): drives the complete single-core matmul,
// copying each produced C block to GM. On these archs, UB-reuse configurations
// require an explicit MTE3 -> MTE2 barrier between the copy-out of one block
// and the copy-in of the next.
// Note: enSequentialWrite, waitIterateAll and fakeMsg are not used on this path.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
template <bool sync>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::IterateAll(
    const GlobalTensor<DstT>& gm, uint8_t enAtomic, bool enSequentialWrite, bool waitIterateAll, bool fakeMsg)
{
#if __CCE_AICORE__ == 200
    // v200: clean/invalidate the entire data cache before streaming from GM.
    GlobalTensor<uint64_t> global;
    global.SetGlobalBuffer((__gm__ uint64_t*)0);
    DataCacheCleanAndInvalid<uint64_t, CacheLine::ENTIRE_DATA_CACHE>(global);
#endif
    while (Iterate()) {
        GetTensorCImpl(gm, enAtomic);
        if constexpr (ToMatmulConfig(MM_CFG).enableUBReuse && !ToMatmulConfig(MM_CFG).enableL1CacheUB) {
            // UB reuse without L1 caching: always serialize copy-out vs next copy-in.
            event_t eventIDMte3ToMte2 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE3_MTE2));
            SetFlag<HardEvent::MTE3_MTE2>(eventIDMte3ToMte2);
            WaitFlag<HardEvent::MTE3_MTE2>(eventIDMte3ToMte2);
        } else if constexpr (ToMatmulConfig(MM_CFG).enableL1CacheUB) {
            // L1-cached UB: the barrier is only needed for an ND input whose
            // L1 cache depth is 0 (i.e. that side still stages through UB).
            if ((var.tiling_.GetDepthAL1CacheUB() == 0 && A_TYPE::format == CubeFormat::ND) ||
                (var.tiling_.GetDepthBL1CacheUB() == 0 && B_TYPE::format == CubeFormat::ND)) {
                event_t eventIDMte3ToMte2 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE3_MTE2));
                SetFlag<HardEvent::MTE3_MTE2>(eventIDMte3ToMte2);
                WaitFlag<HardEvent::MTE3_MTE2>(eventIDMte3ToMte2);
            }
        }
    }
}

// v100, v200
// v100/v200 IterateAll (UB output): drives the complete single-core matmul,
// copying each produced C block into the caller's UB tensor. A V -> MTE2
// barrier after each copy-out keeps the vector stage's UB reads from being
// overwritten by the next copy-in. enAtomic is accepted for interface
// compatibility but unused on this path.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
template <bool sync>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::IterateAll(
    const LocalTensor<DstT>& ubCmatrix, uint8_t enAtomic)
{
#if __CCE_AICORE__ == 200
    // v200: clean/invalidate the entire data cache before streaming from GM.
    GlobalTensor<uint64_t> global;
    global.SetGlobalBuffer((__gm__ uint64_t*)0);
    DataCacheCleanAndInvalid<uint64_t, CacheLine::ENTIRE_DATA_CACHE>(global);
#endif
    (void)(enAtomic);
    while (Iterate()) {
        GetTensorCImpl(ubCmatrix);
        event_t eventIDVToMte2 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::V_MTE2));
        SetFlag<HardEvent::V_MTE2>(eventIDVToMte2);
        WaitFlag<HardEvent::V_MTE2>(eventIDVToMte2);
    }
}

// Runs a full batched-matmul pass whose results land in the caller's UB
// tensor. As with the GM overload, everything is delegated to the
// BatchScheduler module; this entry point only forwards its arguments.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::IterateBatch(
    const LocalTensor<DstT>& ubCmatrix, bool enPartialSum, uint8_t enAtomic, bool enSequentialWrite,
    const uint32_t matrixStrideA, const uint32_t matrixStrideB, const uint32_t matrixStrideC)
{
    auto batchScheduler = MATMUL_MODULE(BatchScheduler);
    batchScheduler->Schedule(ubCmatrix, enPartialSum, enAtomic, enSequentialWrite,
                             matrixStrideA, matrixStrideB, matrixStrideC);
}

// v100/v200 stub: the constant-tiling A-offset is always 0 on these archs
// (the real computation exists only in the >= 220 branch of this file).
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto &MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline int32_t MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB,
    MATMUL_POLICY>::GetBatchIterateAOffsetConstant(const int32_t batchNum, const int32_t batchIdx,
    const int32_t splitOuterIdx, const int32_t splitSize)
{
    // Parameters are intentionally unused in this stub.
    (void)(batchNum);
    (void)(batchIdx);
    (void)(splitOuterIdx);
    (void)(splitSize);
    return 0;
}

// v100/v200 stub: the constant-tiling B-offset is always 0 on these archs
// (the real computation exists only in the >= 220 branch of this file).
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto &MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline int32_t MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB,
    MATMUL_POLICY>::GetBatchIterateBOffsetConstant(const int32_t batchNum, const int32_t batchIdx,
    const int32_t splitOuterIdx, const int32_t splitSize)
{
    // Parameters are intentionally unused in this stub.
    (void)(batchNum);
    (void)(batchIdx);
    (void)(splitOuterIdx);
    (void)(splitSize);
    return 0;
}

// v100/v200 placeholder: per-batch constant-tiling state update is a no-op on
// these archs (the real implementation exists only in the >= 220 branch).
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto &MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB,
    MATMUL_POLICY>::UpdateBatchIterateInfoConstant(const int32_t batchNum, const int32_t batchIdx,
    const int32_t splitOuterIdx, const int32_t splitSize)
{}

// v100/v200 placeholder: batched bias preloading into L1 is a no-op on these
// archs (the real implementation exists only in the >= 220 branch).
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::LoadBatchBiasToL1(
    const int32_t batchOuterIdx)
{}

// v100/v200 placeholder: layout-aware C copy-out is a no-op on these archs
// (the real implementation exists only in the >= 220 branch).
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::GetTensorCByLayout(
    const GlobalTensor<DstT>& gm, uint8_t enAtomic, bool enSequentialWrite, const uint32_t ndGapOffsetIn,
    const uint32_t mdGapOffsetIn)
{}

// v100/v200 placeholder: direct L0C -> GM fixpipe copy is a no-op on these
// archs (fixpipe hardware support is a >= 220 feature).
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::FixpipeL0CToGm(
    const GlobalTensor<DstT> &gm, const LocalTensor<L0cT> &co1Local, int curM, int curN, uint8_t enAtomic, bool enSequentialWrite)
{}

// v100/v200 placeholder: fixpipe copy-out to GM is a no-op on these archs
// (fixpipe hardware support is a >= 220 feature).
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::FixpipeOutToGm(
    const GlobalTensor<DstT>& gm, const LocalTensor<L0cT> &co1Local, int curM, int curN, uint8_t enAtomic,
    bool enSequentialWrite)
{}

#else
// v220
// v220 IterateAll (GM output). In the common configuration it simply drains
// every iteration and moves each result block out to GM. When the
// intra-block partial-sum feature is compiled in, the Scheduler module owns
// the whole loop instead. waitIterateAll is unused on this path.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
template <bool sync>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::IterateAll(
    const GlobalTensor<DstT>& gm, uint8_t enAtomic, bool enSequentialWrite, bool waitIterateAll, bool fakeMsg)
{
    if constexpr (!ToMatmulConfig(MM_CFG).intraBlockPartSum) {
        // Standard path: compute-and-copy loop driven locally.
        while (Iterate()) {
            GetTensorCImpl(gm, enAtomic);
        }
    } else {
        // Intra-block partial-sum path: record the fake-message flag, then let
        // the scheduler module drive the full pass.
        intraBlockMatmul.fakeMsg = fakeMsg;
        MATMUL_MODULE(Scheduler)->Schedule(gm, enAtomic, enSequentialWrite, fakeMsg);
    }
}

// v220 IterateAll (UB output): drains every iteration of the single-core
// matmul, copying each result block into the caller's local tensor.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
template <bool sync>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::IterateAll(
    const LocalTensor<DstT>& gm, uint8_t enAtomic)
{
    for (;;) {
        if (!Iterate()) {
            break;
        }
        GetTensorCImpl(gm, enAtomic);
    }
}

// v220 placeholder: batched iteration with a UB output tensor is a no-op here.
// NOTE(review): presumably the GM-output IterateBatch is the supported v220
// path — confirm before relying on this overload.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::IterateBatch(
    const LocalTensor<DstT>& ubCmatrix, bool enPartialSum, uint8_t enAtomic, bool enSequentialWrite,
    const uint32_t matrixStrideA, const uint32_t matrixStrideB, const uint32_t matrixStrideC)
{}

// v220: computes the element offset of batch `batchIdx` inside the L1 A
// buffer when the single-core shape is a compile-time constant. The batch
// index is first remapped to account for broadcast (BRC) along the G, N or B
// layout axes when A has extent 1 on that axis and B does not, then scaled by
// the aligned size of one A matrix.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto &MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline int32_t MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB,
    MATMUL_POLICY>::GetBatchIterateAOffsetConstant(const int32_t batchNum, const int32_t batchIdx,
    const int32_t splitOuterIdx, const int32_t splitSize)
{
    // Global batch index across both split halves.
    int32_t tmpBatchIdx = batchIdx + splitOuterIdx * batchNum / splitSize;
    if (var.tiling_.GetALayoutInfoG() == 1 && var.tiling_.GetBLayoutInfoG() != 1) { // BRC for G axis
        ASSERT(var.tiling_.GetBLayoutInfoG() > 0);
        ASSERT(var.tiling_.GetALayoutInfoN() == var.tiling_.GetBLayoutInfoN());
        ASSERT(var.tiling_.GetALayoutInfoB() == var.tiling_.GetBLayoutInfoB());
        tmpBatchIdx = tmpBatchIdx / var.tiling_.GetBLayoutInfoG();
    } else if (var.tiling_.GetALayoutInfoN() == 1 && var.tiling_.GetBLayoutInfoN() != 1) {
        // BRC for N axis = idx % BLayoutInfoG + idx / (BLayoutInfoG * BLayoutInfoN)
        ASSERT(var.tiling_.GetBLayoutInfoN() > 0);
        ASSERT(var.tiling_.GetALayoutInfoB() == var.tiling_.GetBLayoutInfoB());
        ASSERT(var.tiling_.GetALayoutInfoG() == var.tiling_.GetBLayoutInfoG());
        tmpBatchIdx = tmpBatchIdx % var.tiling_.GetBLayoutInfoG() +
            tmpBatchIdx / ( var.tiling_.GetBLayoutInfoG() * var.tiling_.GetBLayoutInfoN());
    } else if (var.tiling_.GetALayoutInfoB() == 1 && var.tiling_.GetBLayoutInfoB() != 1 &&
        A_TYPE::layout != LayoutMode::NORMAL) { // BRC for B axis
        ASSERT(var.tiling_.GetBLayoutInfoB() > 0);
        ASSERT(var.tiling_.GetALayoutInfoG() == var.tiling_.GetBLayoutInfoG()); // multi axis BRC is not supported.
        tmpBatchIdx = tmpBatchIdx % (var.tiling_.GetBLayoutInfoG() * var.tiling_.GetBLayoutInfoN()) + tmpBatchIdx /
            (var.tiling_.GetBLayoutInfoG() * var.tiling_.GetBLayoutInfoN() * var.tiling_.GetBLayoutInfoB());
    }
    if constexpr (A_TYPE::layout == LayoutMode::NORMAL) {
        // NORMAL layout: map the merged batch index back onto A's own batch range.
        tmpBatchIdx = tmpBatchIdx / (batchNum / batchA_);
    }
    // Scale by the aligned single-matrix footprint; alignment depends on
    // transposition and on the int8 C0 block size.
    if constexpr (A_TYPE::isTrans) {
        int32_t alignM = Ceil(ToMatmulConfig(MM_CFG).singleCoreM, c0Size_) * c0Size_;
        int32_t alignSize = BLOCK_CUBE;
        if constexpr (IsSameType<SrcT, int8_t>::value) {
            alignSize = c0Size_;
        }
        int32_t alignK = Ceil(ToMatmulConfig(MM_CFG).singleCoreK, alignSize) * alignSize;
        return alignM * alignK * tmpBatchIdx;
    } else {
        int32_t alignM = Ceil(ToMatmulConfig(MM_CFG).singleCoreM, BLOCK_CUBE) * BLOCK_CUBE;
        int32_t alignK = Ceil(ToMatmulConfig(MM_CFG).singleCoreK, c0Size_) * c0Size_;
        return alignM * alignK * tmpBatchIdx;
    }
}

// v220: computes the element offset of batch `batchIdx` inside the L1 B
// buffer when the single-core shape is a compile-time constant. Mirrors
// GetBatchIterateAOffsetConstant with the roles of A and B swapped: the
// index is remapped for broadcast (BRC) along the G, N or B layout axes when
// B has extent 1 on that axis and A does not.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto &MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline int32_t MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB,
    MATMUL_POLICY>::GetBatchIterateBOffsetConstant(const int32_t batchNum, const int32_t batchIdx,
    const int32_t splitOuterIdx, const int32_t splitSize)
{
    // Global batch index across both split halves.
    int32_t tmpBatchIdx = batchIdx + splitOuterIdx * batchNum / splitSize;
    if (var.tiling_.GetBLayoutInfoG() == 1 && var.tiling_.GetALayoutInfoG() != 1) { // BRC for G axis
        ASSERT(var.tiling_.GetALayoutInfoG() > 0);
        ASSERT(var.tiling_.GetALayoutInfoN() == var.tiling_.GetBLayoutInfoN());
        ASSERT(var.tiling_.GetALayoutInfoB() == var.tiling_.GetBLayoutInfoB());
        tmpBatchIdx = tmpBatchIdx / var.tiling_.GetALayoutInfoG();
    } else if (var.tiling_.GetBLayoutInfoN() == 1 && var.tiling_.GetALayoutInfoN() != 1) {
        // BRC for N axis = idx % ALayoutInfoG + idx / (ALayoutInfoG * ALayoutInfoN)
        ASSERT(var.tiling_.GetALayoutInfoN() > 0);
        ASSERT(var.tiling_.GetALayoutInfoB() == var.tiling_.GetBLayoutInfoB());
        ASSERT(var.tiling_.GetALayoutInfoG() == var.tiling_.GetBLayoutInfoG());
        tmpBatchIdx = tmpBatchIdx % var.tiling_.GetALayoutInfoG() +
            tmpBatchIdx / ( var.tiling_.GetALayoutInfoG() * var.tiling_.GetALayoutInfoN());
    } else if (var.tiling_.GetBLayoutInfoB() == 1 && var.tiling_.GetALayoutInfoB() != 1) { // BRC for B axis
        ASSERT(var.tiling_.GetALayoutInfoB() > 0);
        ASSERT(var.tiling_.GetALayoutInfoN() == var.tiling_.GetBLayoutInfoN());
        ASSERT(var.tiling_.GetALayoutInfoG() == var.tiling_.GetBLayoutInfoG()); // multi axis BRC is not supported.
        tmpBatchIdx = tmpBatchIdx % (var.tiling_.GetALayoutInfoG() * var.tiling_.GetALayoutInfoN()) + tmpBatchIdx /
            (var.tiling_.GetALayoutInfoG() * var.tiling_.GetALayoutInfoN() * var.tiling_.GetALayoutInfoB());
    }
    if constexpr (A_TYPE::layout == LayoutMode::NORMAL) {
        // NORMAL layout: map the merged batch index back onto B's own batch range.
        tmpBatchIdx = tmpBatchIdx / (batchNum / batchB_);
    }
    // Scale by the aligned single-matrix footprint; alignment depends on
    // transposition and on the int8 C0 block size.
    if constexpr (B_TYPE::isTrans) {
        int32_t alignN = Ceil(ToMatmulConfig(MM_CFG).singleCoreN, BLOCK_CUBE) * BLOCK_CUBE;
        int32_t alignK = Ceil(ToMatmulConfig(MM_CFG).singleCoreK, c0Size_) * c0Size_;
        return alignN * alignK * tmpBatchIdx;
    } else {
        constexpr int32_t alignSize = IsSameType<SrcT, int8_t>::value ? c0Size_ : BLOCK_CUBE;
        int32_t alignN = Ceil(ToMatmulConfig(MM_CFG).singleCoreN, c0Size_) * c0Size_;
        int32_t alignK = Ceil(ToMatmulConfig(MM_CFG).singleCoreK, alignSize) * alignSize;
        return alignN * alignK * tmpBatchIdx;
    }
}

// v220: refreshes per-batch iteration state after the L1 base addresses for
// the current batch have been installed (see ComputeBatch): updates the bias
// L1 address for this batch, refreshes the quant tensor for MDL variants, and
// rearms the iteration state machine.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto &MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB,
    MATMUL_POLICY>::UpdateBatchIterateInfoConstant(const int32_t batchNum, const int32_t batchIdx,
    const int32_t splitOuterIdx, const int32_t splitSize)
{
    // Support BRC on the BNG axis of the AB matrix.
#ifdef ASCENDC_CPU_DEBUG
    // CPU-debug only: recompute the per-batch A/B data lengths so host-side
    // bounds checking matches the (possibly padded) device layout.
    int32_t dividendA1;
    int32_t dividendA2;
    int32_t dividendB1;
    int32_t dividendB2;
    if constexpr (IsStaticPaddingEnable(MM_CFG)) {
        // Static padding: round each dimension up to a whole number of base blocks.
        dividendA1 = Ceil(ToMatmulConfig(MM_CFG).singleCoreM, var.tiling_.GetBaseM()) * var.tiling_.GetBaseM();
        dividendA2 = Ceil(ToMatmulConfig(MM_CFG).singleCoreK, var.tiling_.GetBaseK()) * var.tiling_.GetBaseK();
        dividendB1 = Ceil(ToMatmulConfig(MM_CFG).singleCoreK, var.tiling_.GetBaseK()) * var.tiling_.GetBaseK();
        dividendB2 = Ceil(ToMatmulConfig(MM_CFG).singleCoreN, var.tiling_.GetBaseN()) * var.tiling_.GetBaseN();
    } else {
        dividendA1 = ToMatmulConfig(MM_CFG).singleCoreM;
        dividendA2 = ToMatmulConfig(MM_CFG).singleCoreK;
        dividendB1 = ToMatmulConfig(MM_CFG).singleCoreK;
        dividendB2 = ToMatmulConfig(MM_CFG).singleCoreN;
    }
    // A matrix byte size, aligned per transposition and int8 C0 block rules.
    int aMatrixSingleBatchSize;
    if constexpr (A_TYPE::isTrans) {
        if constexpr (IsSameType<SrcT, int8_t>::value) {
            aMatrixSingleBatchSize =
                Ceil(dividendA1, c0Size_) * c0Size_ * Ceil(dividendA2, c0Size_) * c0Size_ * sizeof(SrcT);
        } else {
            aMatrixSingleBatchSize =
                Ceil(dividendA1, c0Size_) * c0Size_ * Ceil(dividendA2, BLOCK_CUBE) * BLOCK_CUBE * sizeof(SrcT);
        }
    } else {
        aMatrixSingleBatchSize =
            Ceil(dividendA1, BLOCK_CUBE) * BLOCK_CUBE * Ceil(dividendA2, c0Size_) * c0Size_ * sizeof(SrcT);
    }

    // B matrix byte size, aligned per transposition and int8 C0 block rules.
    int bMatrixSingleBatchSize;
    if constexpr (B_TYPE::isTrans) {
        bMatrixSingleBatchSize =
            Ceil(dividendB1, c0Size_) * c0Size_ * Ceil(dividendB2, BLOCK_CUBE) * BLOCK_CUBE * sizeof(SrcT);
    } else {
        if constexpr (IsSameType<SrcT, int8_t>::value) {
            bMatrixSingleBatchSize =
                Ceil(dividendB1, c0Size_) * c0Size_ * Ceil(dividendB2, c0Size_) * c0Size_ * sizeof(SrcT);
        } else {
            bMatrixSingleBatchSize =
                Ceil(dividendB1, BLOCK_CUBE) * BLOCK_CUBE * Ceil(dividendB2, c0Size_) * c0Size_ * sizeof(SrcT);
        }
    }
    var.leftMatrix_.address_.dataLen = aMatrixSingleBatchSize;
    var.rightMatrix_.address_.dataLen = bMatrixSingleBatchSize;
#endif
    if constexpr (ToMatmulConfig(MM_CFG).enableSetBias) {
        if (var.enableBias_) {
            // Point the bias input at this batch's slice of the cached L1 bias.
            int32_t offsetBias =
                GetBatchIterateBiasOffset(batchNum, batchIdx, var.enableBias_, splitOuterIdx, splitSize);
            var.inputBias_.address_ = var.cacheHeadBias_[offsetBias].address_;
        }
    }
    if constexpr (DoMatmulMDL(MM_CFG) || DoMatmulSpecialMDL(MM_CFG)) {
        MatmulQuantProcessor::UpdateQuantTensor(var.singleCoreN_);
    }
    // Rearm the iteration state machine so the next Iterate() starts fresh.
    var.isFirstIter_ = true;
}

// v220: copies the bias data for all batches of the current outer group from
// GM into a freshly allocated L1 bias buffer (var.cacheHeadBias_). Skipped
// entirely when the bias already physically resides in L1 or bias is disabled.
// The matching FreeTensor/FreeAllEvent happens in ComputeBatch.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::LoadBatchBiasToL1(
    const int32_t batchOuterIdx)
{
    int32_t batchNum = batchA_ > batchB_ ? batchA_ : batchB_;
    if constexpr (!PhyPosIsL1(BIAS_TYPE::pos)) {
        if (var.enableBias_) {
            var.cacheHeadBias_ = var.qidBias_.template AllocTensor<BiasT>();
            GlobalTensor<BiasT> biasGlobal;
            biasGlobal.SetGlobalBuffer(var.biasGlobal_);

            if constexpr (!ToMatmulConfig(MM_CFG).isBiasBatch) {
                // Single shared bias vector: one contiguous copy of singleCoreN elements.
                auto blockLen = Ceil(var.tiling_.GetSingleCoreN(), AscendCUtils::GetC0Count(sizeof(BiasT)));
                DataCopy(var.cacheHeadBias_, biasGlobal, { (uint16_t)1,
                    static_cast<uint16_t>(blockLen), (uint16_t)0, (uint16_t)0 });
            } else {
                // Per-batch bias: copy each batch's vector into its C0-aligned L1 slot.
                biasGlobal.SetAddr(batchOuterIdx * batchNum * var.singleCoreN_);
                for (auto i = 0; i < batchNum; ++i) {
                    DataCopy(var.cacheHeadBias_[i * CeilAlignNum(var.tiling_.GetSingleCoreN(),
                                                                 AscendCUtils::GetC0Count(sizeof(BiasT)))],
                             biasGlobal[i * var.tiling_.GetSingleCoreN()],
                             { 1, 1, (uint16_t)(var.tiling_.GetSingleCoreN()), 0, 1, 1, 1, 0 });
                }
            }
            // delete after tpipe supports bias queue
            // MTE2 -> MTE1 barrier: the bias must be resident in L1 before use.
            event_t eventIDMte2ToMte1 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE2_MTE1));
            SetFlag<HardEvent::MTE2_MTE1>(eventIDMte2ToMte1);
            WaitFlag<HardEvent::MTE2_MTE1>(eventIDMte2ToMte1);
        }
    }
}


// v220: moves the current iteration's C block from the L0C output buffer to
// GM via the CopyCubeOut module, honoring the sequential-write flag and the
// atomic mode. nGapOffsetIn/mGapOffsetIn are accepted for interface
// compatibility but unused in this implementation.
// NOTE(review): only enAtomic == 1 enables atomic add before the copy, yet
// any non-zero enAtomic resets to SetAtomicNone() afterwards — values 2/3
// appear to be handled elsewhere; confirm against the callers.
template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto &MM_CFG, class MM_CB,
    MATMUL_POLICY_TEMPLATE_OF(MATMUL_POLICY)>
__aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>::GetTensorCByLayout(
    const GlobalTensor<DstT> &gm, uint8_t enAtomic, bool enSequentialWrite, const uint32_t nGapOffsetIn,
    const uint32_t mGapOffsetIn)
{
    // Fetch the computed C block and pass it through the output queue.
    auto co1Local = MATMUL_MODULE(CubeOutBuffer)->GetTensor();
    MATMUL_MODULE(CubeOutBuffer)->EnQue(co1Local);
    MATMUL_MODULE(CubeOutBuffer)->DeQue();
    if (enAtomic == 1) {
        SetAtomicAdd<DstT>();
    }

    if (enSequentialWrite) {
        MATMUL_MODULE(CopyCubeOut)
            ->template Copy<true>(gm, co1Local, var.curM_, var.curN_, var.baseUseM_, var.baseUseN_, var.blockUseM_,
                                     var.blockUseN_);
    } else {
        MATMUL_MODULE(CopyCubeOut)
            ->template Copy<false>(gm, co1Local, var.curM_, var.curN_, var.baseUseM_, var.baseUseN_, var.blockUseM_,
                                     var.blockUseN_);
    }

    if (enAtomic != 0) {
        SetAtomicNone();
    }
    MATMUL_MODULE(CubeOutBuffer)->FreeTensor(co1Local);
}

#endif
} // namespace AscendC
#endif
